
    g{c                         d Z dZddlZddlmZ ddlmZ ddlmZm	Z	 ddl
mZmZmZ dZ G d	 d
e	      ZddZedk(  r e        yy)z:
Corpus reader for the XML version of the CHILDES corpus.
z
epytext en    Ndefaultdict)concat)ElementTreeXMLCorpusReader)LazyConcatenationLazyMapflattenz#http://www.talkbank.org/ns/talkbankc                       e Zd ZdZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZddZ	d	 Z
dd
Zd ZddZd Zd ZddZd Zd Z	 dZddZy)CHILDESCorpusReadera  
    Corpus reader for the XML version of the CHILDES corpus.
    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
    c                 @    t        j                  | ||       || _        y N)r   __init___lazy)selfrootfileidslazys       O/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/childes.pyr   zCHILDESCorpusReader.__init__&   s      tW5
    Nc                    	
 d
d	 j                   s5 j                  |      D cg c]  } j                  |
	       c}S 	 
fd}t        t	        | j                  |                  S c c}w )a(  
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        NFc           
      4    j                  |       S r   
_get_words	fileidposrelationreplacer   sentspeakerstemstrip_spaces	    r   <lambda>z+CHILDESCorpusReader.words.<locals>.<lambda>M       4??GT43W$
r   r   abspathsr   r   r	   r   r   r!   r"   r   r#   r   r   	get_wordsr   r    s   ` `````  @@r   wordszCHILDESCorpusReader.words*   s    2 zz
 #mmG4	 5F GT43W 5	 
 
	 !DMM'4J!KLL   A>c                    	
 d
d	 j                   s5 j                  |      D cg c]  } j                  |
	       c}S 	 
fd}t        t	        | j                  |                  S c c}w )a  
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        NTc           
      4    j                  |       S r   r   r   s	    r   r$   z2CHILDESCorpusReader.tagged_words.<locals>.<lambda>w   r%   r   r&   r(   s   ` `````  @@r   tagged_wordsz CHILDESCorpusReader.tagged_wordsR       6 zz
 #mmG4	 5F GT43W 5	 
 
	 !DMM'4J!KLLr+   c                    	
 d
d	 j                   s5 j                  |      D cg c]  } j                  |
	       c}S 	 
fd}t        t	        | j                  |                  S c c}w )a  
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of word strings.
        :rtype: list(list(str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        TFc           
      4    j                  |       S r   r   r   s	    r   r$   z+CHILDESCorpusReader.sents.<locals>.<lambda>   r%   r   r&   r(   s   ` `````  @@r   sentszCHILDESCorpusReader.sents|   s    6 zz
 #mmG4	 5F GT43W 5	 
 
	 !DMM'4J!KLLr+   c                    	
 d
d	 j                   s5 j                  |      D cg c]  } j                  |
	       c}S 	 
fd}t        t	        | j                  |                  S c c}w )a  
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        Tc           
      4    j                  |       S r   r   r   s	    r   r$   z2CHILDESCorpusReader.tagged_sents.<locals>.<lambda>   r%   r   r&   r(   s   ` `````  @@r   tagged_sentsz CHILDESCorpusReader.tagged_sents   r/   r+   c                     | j                   s.| j                  |      D cg c]  }| j                  |       c}S t        | j                  | j                  |            S c c}w )zu
        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
        :rtype: list(dict)
        )r   r'   _get_corpusr	   r   r   r   s      r   corpuszCHILDESCorpusReader.corpus   sX    
 zz;?==;QR;QD$$V,;QRRt''w)?@@ S   A c                     t               }t        j                  |      j                         }|j	                         D ]
  \  }}|||<    |S r   )dictr   parsegetrootitems)r   r   resultsxmldockeyvalues         r   r7   zCHILDESCorpusReader._get_corpus   sD    &""6*224 ,,.JC GCL )r   c                     | j                   s.| j                  |      D cg c]  }| j                  |       c}S t        | j                  | j                  |            S c c}w )z
        :return: the given file(s) as a dict of
            ``(participant_property_key, value)``
        :rtype: list(dict)
        )r   r'   _get_participantsr	   r8   s      r   participantsz CHILDESCorpusReader.participants   sX     zzAEwAWXAWvD**62AWXXt--t}}W/EFF Yr:   c                    fdt        j                  |      j                         }        }|j                  dt         dt         d      D ]1  }|j                         D ]  \  }}|||j                  d         |<    3 |S )Nc                      t               S r   r   )dictOfDictss   r   rI   z:CHILDESCorpusReader._get_participants.<locals>.dictOfDicts   s    {++r   .//{}Participants/{}participantid)r   r=   r>   findallNSr?   get)r   r   rA   patparticipantrB   rC   rI   s          @r   rE   z%CHILDESCorpusReader._get_participants   s    	, ""6*224m!>>B4(M:
K *//1
U27KOOD)*3/ 2

 
r   c                       j                   s0 j                  |      D cg c]  } j                  |       c}S  fd}t        | j                  |            S c c}w )z
        :return: the given file(s) as string or int
        :rtype: list or int

        :param month: If true, return months instead of year-month-date
        c                 *    j                  |       S r   )_get_age)r   monthr   r!   s    r   r$   z)CHILDESCorpusReader.age.<locals>.<lambda>  s    vw!Fr   )r   r'   rU   r	   )r   r   r!   rV   r   get_ages   ` ``  r   agezCHILDESCorpusReader.age   sg     zz #mmG44F fgu54  Gwg 677s   A"c                 H   t        j                  |      j                         }|j                  dt         dt         d      D ]?  }	 |j                  d      |k(  r(|j                  d      }|r| j                  |      }|c S A y # t        t        f$ r}Y d }~ y d }~ww xY w)NrJ   rK   rL   rM   rX   )	r   r=   r>   rN   rO   rP   convert_age	TypeErrorAttributeError)r   r   r!   rV   rA   rQ   rX   es           r   rU   zCHILDESCorpusReader._get_age  s    ""6*224>>E"->rd-"PQC774=G+''%.C"..s3J	 , R ~. s   9BB!B!c                    t        j                  d|      }t        |j                  d            dz  t        |j                  d            z   }	 t        |j                  d            dkD  r|dz  }|S # t        $ r}Y d}~|S d}~ww xY w)z8Caclculate age in months from a string in CHILDES formatzP(\d+)Y(\d+)M?(\d?\d?)D?               N)rematchintgroup
ValueError)r   age_yearm	age_monthr]   s        r   rZ   zCHILDESCorpusReader.convert_age  s    HH0(;
Ob(3qwwqz?:		1771:#Q	   		s   "A4 4	BBc                       j                   s0 j                  |      D cg c]  } j                  |       c}S  fd}t        | j                  |            S c c}w )z]
        :return: the given file(s) as a floating number
        :rtype: list(float)
        r!   c                 *    j                  |       S )Nrm   )_getMLU)r   r   r!   s    r   r$   z)CHILDESCorpusReader.MLU.<locals>.<lambda>+  s    fg!Fr   )r   r'   ro   r	   )r   r   r!   r   get_MLUs   ` `  r   MLUzCHILDESCorpusReader.MLU!  sg    
 zz #mmG44F VW54  Gwg 677s   A c           
         | j                  ||dddddd      }g }g }d}d}|D ]  }|D 	
cg c]  \  }	}
|
	 }}	}
t        d |D              r*|g k(  r0||k(  r6|j                  |D 	
cg c]  \  }	}
|		 c}
}	       t        dd hj	                  |            dkD  r-||j                  d      z  }||j                  d       z  }|dz  }|} 	 t        |      }t        t        |D 	cg c]  }	|	j                  d       c}	            |z
  }t        |      |z
  }||z  }|S c c}
}	w c c}
}	w c c}	w # t        $ r d}Y |S w xY w)	NTF)r!   r    r"   r   r   r#   r   r   c              3   &   K   | ]	  }|d k(    yw)unkN ).0r   s     r   	<genexpr>z.CHILDESCorpusReader._getMLU.<locals>.<genexpr>@  s     37C3%<7s   cor_   -)	r   anyappendlenintersectioncountr
   splitZeroDivisionError)r   r   r!   r2   r@   lastSent
numFillerssentDiscountr    wordr   posListthisWordListnumWordsnumSentsmlus                   r   ro   zCHILDESCorpusReader._getMLU.  s      	
 
D.23d{csdG33733!=$=>d|009:Q>'--"55J'--"55J A%LH% &
	"7+L GFTZZ_FGH:U  7|l2HX%C 
= 4  > G ! 	C
	s/   D#*D)D4 (D/ !D4 /D4 4EEc	                 
   t        |t              r|dk7  r|g}t        j                  |      j	                         }	g }
|	j                  dt        z        D ]  }g }|dk(  s|j                  d      |v s|j                  dt        z        D ]@  }d }d }d }|rH|j                  dt         dt         d      r(|j                  dt         dt         dt         d	      }nB|r@|j                  dt         dt         d
      r |j                  dt         dt         d
      }|j                  r|j                  }nd}|r|j                         }|s|r	 |j                  dt        z        }|j                  }	 |j                  dt         dt         dt         d      }|d|j                  z   z  }	 |j                  dt        dt        dt        dt        d	      }|j                  }|r|d|z   z  }|s|r	 |j                  dt        z        }|j                  dt        z        }|g k7  r#|d   j                  dz   |d   j                  z   }n|d   j                  }	 |j                  dt        dt        dt        dt        dt        d      }|j                  dt        dt        dt        dt        dt        d      }|r#|d   j                  dz   |d   j                  z   }n|d   j                  }|r|d|z   z  }||f}|dk(  r|j                  dt         dt         d      D ]  }|j                  d      dk(  sE|d   |d   |j                  d       d!z   |j                  d"      z   d!z   |j                  d#      z   f}\|d   |d   |d$   |d   |d   |j                  d       d!z   |j                  d"      z   d!z   |j                  d#      z   f} 	 |j                  dt         dt         dt         d      D ]  }|j                  d      dk(  sE|d   |d   |j                  d       d!z   |j                  d"      z   d!z   |j                  d#      z   f}\|d   |d   |d$   |d   |d   |j                  d       d!z   |j                  d"      z   d!z   |j                  d#      z   f} 	 |j                  |       C |s|r|
j                  |       |
j                  |        t        d% |
      S # t        $ r}Y d }~d }~ww xY w#  Y rxY w# t        $ r d}Y Hw xY w# t        t        f$ r}d}Y d }~d }~ww xY w#  Y ZxY w#  Y xY w)&NALLz.//{%s}uwhoz.//{%s}wrJ   z}w/{z}replacementz}replacement/{z}wz}wk z.//{%s}stemz}mor/{z}mw/{z}mkry   z}mor-post/{z}stem~z.//{%s}cz.//{%s}sr   :z}pos/{z}cz}sTz}gratypegrtr_   index|headr   ra   c                     | S r   ru   )xs    r   r$   z0CHILDESCorpusReader._get_words.<locals>.<lambda>  s    r   )
isinstancestrr   r=   r>   rN   rO   rP   findtextstripr\   
IndexErrorr{   extendr	   )r   r   r!   r    r"   r   r   r#   r   rA   r@   xmlsentr2   xmlwordinfl
suffixStem	suffixTagr   xmlstemr]   xmlinfl	xmlsuffixxmlposxmlpos2tagxmlsuffixposxmlsuffixpos2xmlstem_relxmlpost_rels                                r   r   zCHILDESCorpusReader._get_words^  s    w$E)9iG""6*224~~j2o6GE%7;;u#5#@&zB?GD!%J $I7<<%t6"]0S#T"),,#B4vbT1A"SI# !W\\E"VB4t2L%M"),,rd&D/I"J||&||!"#zz|4!&-ll=23E&FG#*<<D!&-ll"'t8B4wrd$ G'G !C',,$66D,(/#%r2r!3)I *3J & C*$44D3%%,__Z"_%EF&-ooj2o&FG&"}&,Qinns&:WQZ__&L&,Qinn!+2??#%r2r2!7,L -4OO#%r2r2!7-M  -$0O$8$83$>qAQAVAV$V !* -9O,@,@	 %3?2C $c{  4'+2??#B4xt59,K $/??6#:e#C$(G$(G$/OOG$<&)%*&1oof&=%> '*%* '2ooj&A	%B(" %)G$(G$(G$(G$(G$/OOG$<&)%*&1oof&=%> '*%* '2ooj&A	%B(",4!/6"'t8B4}RD N0 (3v'>%'G(21(21(3(@*-).*5//&*A)B +.). +6//**E	)F2&J )31(21(21(21(21(3(@*-).*5//&*A)B +.). +6//**E	)F2&J08 LL&O  @P 8NN5)NN5)_ 7` {G,,i  . ! !!   . ,)+J, !/
; %"$C%"! z! sc   $T	:9T 4:T(>A'T:&BU'CU!		TT T%(T76T7:U	UUU!U%z3https://childes.talkbank.org/browser/index.php?url=c                    ddl }|r	|dz   |z   }n| j                  dz   |z   }t        j                  dd|      }d|j	                         v rt        j
                  d|      d   }n1d|j	                         v rdt        j
                  d	|      d   z   }n|}|j                  d
      r|dd }|j                  d      s|dz   }| j                  |z   }|j                  |       t        d|       y)a  Map a corpus file to its web version on the CHILDES website,
        and open it in a web browser.

        The complete URL to be used is:
            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

        If no urlbase is passed, we try to calculate it.  This
        requires that the childes corpus was set up to mirror the
        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

        The function first looks (as a special case) if "Eng-USA" is
        on the path consisting of <corpus root>+fileid; then if
        "childes", possibly followed by "data-xml", appears. If neither
        one is found, we use the unmodified fileid and hope for the best.
        If this is not right, specify urlbase explicitly, e.g., if the
        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
        r   N/z\\z	/childes/z$(?i)/childes(?:/data-xml)?/(.*)\.xmlzeng-usazEng-USA/z/(?i)Eng-USA/(.*)\.xmlz.xmlz.chazOpening in browser:)

webbrowserr   rd   sublowerrN   endswithchildes_url_baseopen_new_tabprint)r   r   urlbaser   pathfullurls          r   webview_filez CHILDESCorpusReader.webview_file  s    * 	S=6)D99s?V+D66%d+Ddjjl*zz"I4PQRSdjjl*!BJJ/H$$OPQ$RR == 9D}}V$&=D##d*$#S)r   )T)Nr   FFTF)Nr   FNTFr   )NCHIF)Nr   )__name__
__module____qualname____doc__r   r*   r.   r2   r5   r9   r7   rF   rE   rX   rU   rZ   rq   ro   r   r   r   ru   r   r   r   r      s    	 &MT (MX (MX (MTAG8
8.`Z-|
 N.*r   r   c           
      >   | sddl m}  |d      } 	 t        | d      }|j                         dd D ]H  }d}d}|j	                  |      d   j                         D ]  \  }}|dk(  r|}|d	k(  s|} t        d
||d       t        d|j                  |      dd d       t        d|j                  |d      dd d       t        d|j                  |      dd d       t        d|j                  |d      dd d       t        d|j                  |d      dd d       t        d|j                  |d      dd d       t        d|j                  |d      dd d       t        d|j                  |      dd d       |j                  |      d   j                         D ],  \  }}	|	j                         D ]  \  }}t        d||d |        . t        d!t        |j                  |                   t        d"t        |j                  |d                   t        d#|j                  |             t        d$|j                  |d%             t        d&|j                  |             t                K y# t        $ r}
t        d'       Y d}
~
yd}
~
ww xY w)(zp
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    r   )r   z!corpora/childes/data-xml/Eng-USA/z.*.xmlN   r   CorpusIdReadingz .....zwords:   z...zwords with replaced words:T)r   z ...zwords with pos tags:zwords (only MOT):MOTrm   zwords (only CHI):r   zstemmed words:)r"   z!words with relations and pos-tag:)r   z	sentence:ra   z	participantr   znum of sent:znum of morphemes:zage:zage in month:)rV   zMLU:aU  The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/")
        )	nltk.datar   r   r   r9   r?   r   r*   r.   r2   rF   r|   rX   rq   LookupError)corpus_rootr   childesfiler9   	corpus_idrB   rC   rR   valuesr]   s              r   demor   6  s   
 ">?.
%k8<OO%bq)DFI%nnT215;;=
U(?"F$; %I	 >
 )VY9(GMM$/3U;,dD1"15
 ('*>*>t*DRa*H&Q%w}}T5}'I"1'MuU%w}}T5}'I"1'MuU"GMM$TM$B2A$FO3dT22A6
 +w}}T22A6?'.';';D'A!'D'J'J'L#V"(,,.JC/;S%H #1 (M .#gmmD&9":;%s7==D=+I'JK&'++d+,/7;;t4;#@A&'++d+,GC *F  
	
 	

s   AI> .HI> >	JJJ__main__r   )r   __docformat__rd   collectionsr   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   	nltk.utilr   r	   r
   rO   r   r   r   ru   r   r   <module>r      sS     	 # * C 9 9 +W*/ W*x8
~ zF r   