
    g&4                     V    d Z ddlmZ 	 d
dZ	 ddZddZd Zedk(  r e        g d	Zy)a  
A collection of methods for tree (grammar) transformations used
in parsing natural language.

Although many of these methods are technically grammar transformations
(ie. Chomsky Norm Form), when working with treebanks it is much more
natural to visualize these modifications in a tree structure.  Hence,
we will do all transformation directly to the tree itself.
Transforming the tree directly also allows us to do parent annotation.
A grammar can then be simply induced from the modified tree.

The following is a short tutorial on the available transformations.

 1. Chomsky Normal Form (binarization)

    It is well known that any grammar has a Chomsky Normal Form (CNF)
    equivalent grammar where CNF is defined by every production having
    either two non-terminals or one terminal on its right hand side.
    When we have hierarchically structured data (ie. a treebank), it is
    natural to view this in terms of productions where the root of every
    subtree is the head (left hand side) of the production and all of
    its children are the right hand side constituents.  In order to
    convert a tree into CNF, we simply need to ensure that every subtree
    has either two subtrees as children (binarization), or one leaf node
    (non-terminal).  In order to binarize a subtree with more than two
    children, we must introduce artificial nodes.

    There are two popular methods to convert a tree into CNF: left
    factoring and right factoring.  The following example demonstrates
    the difference between them.  Example::

     Original       Right-Factored     Left-Factored

          A              A                      A
        / | \          /   \                  /   \
       B  C  D   ==>  B    A|<C-D>   OR   A|<B-C>  D
                            /  \          /  \
                           C    D        B    C

 2. Parent Annotation

    In addition to binarizing the tree, there are two standard
    modifications to node labels we can do in the same traversal: parent
    annotation and Markov order-N smoothing (or sibling smoothing).

    The purpose of parent annotation is to refine the probabilities of
    productions by adding a small amount of context.  With this simple
    addition, a CYK (inside-outside, dynamic programming chart parse)
    can improve from 74% to 79% accuracy.  A natural generalization from
    parent annotation is to grandparent annotation and beyond.  The
    tradeoff becomes accuracy gain vs. computational complexity.  We
    must also keep in mind data sparcity issues.  Example::

     Original       Parent Annotation

          A                A^<?>
        / | \             /   \
       B  C  D   ==>  B^<A>    A|<C-D>^<?>     where ? is the
                                 /  \          parent of A
                             C^<A>   D^<A>


 3. Markov order-N smoothing

    Markov smoothing combats data sparcity issues as well as decreasing
    computational requirements by limiting the number of children
    included in artificial nodes.  In practice, most people use an order
    2 grammar.  Example::

      Original       No Smoothing       Markov order 1   Markov order 2   etc.

       __A__            A                      A                A
      / /|\ \         /   \                  /   \            /   \
     B C D E F  ==>  B    A|<C-D-E-F>  ==>  B   A|<C>  ==>   B  A|<C-D>
                            /   \               /   \            /   \
                           C    ...            C    ...         C    ...



    Annotation decisions can be thought about in the vertical direction
    (parent, grandparent, etc) and the horizontal direction (number of
    siblings to keep).  Parameters to the following functions specify
    these values.  For more information see:

    Dan Klein and Chris Manning (2003) "Accurate Unlexicalized
    Parsing", ACL-03.  https://www.aclweb.org/anthology/P03-1054

 4. Unary Collapsing

    Collapse unary productions (ie. subtrees with a single child) into a
    new non-terminal (Tree node).  This is useful when working with
    algorithms that do not allow unary productions, yet you do not wish
    to lose the parent information.  Example::

       A
       |
       B   ==>   A+B
      / \        / \
     C   D      C   D

    TreeNc                 4   |d}| | j                         gfg}|g k7  r|j                         \  }}t        |t              rd}	|j                         }
|dk7  rg|| k7  rbt        |d   t              rOdj	                  |dj                  |            }	|j                  |j                         |	z          |
g|d |dz
   z   }|D ]  }|j                  ||f        t        |      dkD  r|D cg c]  }|j                          }}|j                         }g |dd  |}t        |      }t        d|dz
        D ]  }|dk(  rWd	j	                  |
|dj                  ||t        ||z   |g             |	      }t        |g       }|j                  d      |g|dd  nYd	j	                  |
|dj                  |t        ||z
  |z
  dg      |        |	      }t        |g       }||j                         g|dd  |} |D cg c]  }| c}|dd  |g k7  ry y c c}w c c}w )
Ni   r   z{}<{}>-      rightz
{}{}<{}>{})labelpop
isinstancer   formatjoin	set_labelappendlencopyrangeminmax)treefactor
horzMarkov
vertMarkov	childChar
parentCharnodeListnodeparentparentStringoriginalNodechild
childNodesnodeCopycurNodenumChildreninewHeadnewNodes                      I/var/www/openai/venv/lib/python3.12/site-packages/nltk/tree/transforms.pychomsky_normal_formr+   q   sC    
 

~&'H
b.||~fdD!L::<LQ44<JtAw4M'z388F;KLtzz|l:;&&1A:>*BB 0  4y1}9=>ekkm
>99;QR!(mq+/2A("."5"5(%HH *1sA
NK3P/Q R )# #'w"3'/||A&@"."5"5(%HH *3a*0La/P+QUVTV W )# #'w"3'.&?%G1 34 3;;(u(;c b." ?@ <s   .H8	Hc                 d   | g fg}|g k7  r|j                         \  }}t        |t              rt|j                         j	                  |      }|dk7  rq|j                  |      }	|j                  ||	          |	dk(  r+|j                  d|d          |j                  d|d          n|j                  |d   |d   g       |}n|j                         j	                  |      }
|
dk7  r"|j                  |j                         d |
        |dk(  rz|j                         j	                  |      }|dk7  rVt        |j                         |dz   d  |D cg c]  }| c}      }|j                  |j                         d |        |g|dd  |D ]  }|j                  ||f        |g k7  ry y c c}w )Nr   r   T)r   r   r   r   findindexremoveinsertextendr   r   )r   expandUnaryr   r   	unaryCharr   r   r   
childIndex	nodeIndexparentIndex
unaryIndexr'   r)   r"   s                  r*   un_chomsky_normal_formr9      s    r
|H
b.||~fdD! **95JR"LL.	fY/0 >MM!T!W-MM!T!W-MM47DG"45 "jjl//
;"$NN4::<#=> $&!%!2!29!=J!R'"& JJLa)9:<M1Q<M# tzz|KZ'@A$+9QR. K b.@ =Ns   	F-
c                    |dk(  r%t        | t              rt        |       dk(  r| d   g}n| g}|g k7  r|j                         }t        |t              rt        |      dk(  rt        |d   t              rv|dk(  st        |d   t              r^|j	                  |j                         |z   |d   j                         z          |d   D cg c]  }| c}|dd |j                  |       n|D ]  }|j                  |        |g k7  ryyc c}w )a  
    Collapse subtrees with a single child (ie. unary productions)
    into a new non-terminal (Tree node) joined by 'joinChar'.
    This is useful when working with algorithms that do not allow
    unary productions, and completely removing the unary productions
    would require loss of useful information.  The Tree is modified
    directly (since it is passed by reference) and no value is returned.

    :param tree: The Tree to be collapsed
    :type  tree: Tree
    :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
                        Part-of-Speech tags) since they are always unary productions
    :type  collapsePOS: bool
    :param collapseRoot: 'False' (default) will not modify the root production
                         if it is unary.  For the Penn WSJ treebank corpus, this corresponds
                         to the TOP -> productions.
    :type collapseRoot: bool
    :param joinChar: A string used to connect collapsed node values (default = "+")
    :type  joinChar: str
    Fr   r   T)r   r   N)r   r   r   r   r   r   r   )r   collapsePOScollapseRootjoinCharr   r   r"   s          r*   collapse_unaryr>      s    , uD$!7CING96 b.||~dD!D	QtAw- D(JtDz4,Htzz|h6aHI/3Aw7weEw7QR %!EOOE* " b. 8s   		D
c                  t   ddl m}  ddlm} ddlm} d} |j                  |d      } | |      }t        |        | |      }t        |        | |      }t        |dd	
        | |      }t        |       |j                         }	t        |       t        |	       t        d||	k(          ||||||       y)zF
    A demonstration showing how each tree transform can be used.
    r   )deepcopy)
draw_treesr   aX  (TOP
  (S
    (S
      (VP
        (VBN Turned)
        (ADVP (RB loose))
        (PP
          (IN in)
          (NP
            (NP (NNP Shane) (NNP Longman) (POS 's))
            (NN trading)
            (NN room)))))
    (, ,)
    (NP (DT the) (NN yuppie) (NNS dealers))
    (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
    (. .)))T)remove_empty_top_bracketingr	   r   )r   r   zSentences the same? N)r   r@   nltk.draw.treerA   nltk.tree.treer   
fromstringr>   r+   r9   pprintprint)
r@   rA   r   sentencetcollapsedTreecnfTree
parentTreeoriginal	sentence2s
             r*   demorO     s    
 )#H  	dCA QKM=! }%G  -(J
qQ? 
#H8$ !I	(O	)	
 (i"78q-*h?    __main__)r+   r9   r>   )r
   Nr   |^)TrR   rS   +)FFrT   )	__doc__rD   r   r+   r9   r>   rO   __name____all__ rP   r*   <module>rY      sM   dL   TWC<N FI+/\++f3@l zF
MrP   