
    g$                     r    d Z ddlmZ ddlmZmZmZ  G d de      ZddZ G d d	e	      Z
 G d
 de      Zy)zACorpus reader for the XML version of the British National Corpus.    )concat)ElementTreeXMLCorpusReaderXMLCorpusViewc                   F    e Zd ZdZd
dZddZddZddZddZddZ	d	 Z
y)BNCCorpusReadera7  Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    c                 @    t        j                  | ||       || _        y N)r   __init___lazy)selfrootfileidslazys       K/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/bnc.pyr   zBNCCorpusReader.__init__   s      tW5
    Nc                 ,    | j                  |dd||      S )aT  
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        FN_viewsr   r   strip_spacestems       r   wordszBNCCorpusReader.words#   s     {{7E4dCCr   c                 8    |rdnd}| j                  |d|||      S )a   
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        c5posFr   r   r   r   r   r   tags         r   tagged_wordszBNCCorpusReader.tagged_words/   s$     de{{7E3TBBr   c                 ,    | j                  |dd||      S )a  
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        TNr   r   s       r   sentszBNCCorpusReader.sents?   s     {{7D$TBBr   c                 :    |rdnd}| j                  |d|||      S )a  
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        r   r   T)sentr   r   r   r   r   s         r   tagged_sentszBNCCorpusReader.tagged_sentsL   s.     de{{$C[t  
 	
r   c                     | j                   rt        n| j                  }t        | j	                  |      D cg c]  } ||||||       c}      S c c}w )zPA helper function that instantiates BNCWordViews or the list of words/sentences.)r   BNCWordView_wordsr   abspaths)r   r   r#   r   r   r   ffileids           r   r   zBNCCorpusReader._views]   sX    ::K4;; #mmG44F &$[$74
 	
s   Ac           	      8   g }t        j                  |      j                         }|j                  d      D ]  }g }	t	        |      D ]  }
|
j
                  }|sd}|s|r|j                         }|r|
j                  d|      }|dk(  r||
j                  d      f}n(|dk(  r#||
j                  d|
j                  d            f}|	j                  |        |r)|j                  t        |j                  d   |	             |j                  |	        d|vsJ |S )a  
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.//s hwr   r   nN)r   parsegetrootfindall_all_xmlwords_intextstripgetappendBNCSentenceattribextend)r   r*   bracket_sentr   r   r   resultxmldocxmlsentr#   xmlwordwords               r   r'   zBNCCorpusReader._wordsg   s
    ""6*224~~f-GD+G4||D$::<D";;tT2D$; '++d"34DE\ '++eW[[5F"GHDD! 5 k'..*=tDEd#% .( 6!!!r   )T)NTF)NFTF)NFFTF)__name__
__module____qualname____doc__r   r   r   r!   r$   r   r'    r   r   r   r      s-     
DC C
"
#r   r   Nc                 t    |g }| D ].  }|j                   dv r|j                  |       #t        ||       0 |S )N)cw)r   r6   r2   )eltr;   childs      r   r2   r2      s?    ~99
"MM% UF+	 
 Mr   c                       e Zd ZdZd Zy)r7   z
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    c                 >    || _         t        j                  | |       y r
   )numlistr   )r   rL   itemss      r   r   zBNCSentence.__init__   s    dE"r   N)r@   rA   rB   rC   r   rD   r   r   r7   r7      s    
#r   r7   c                   8    e Zd ZdZh dZ	 d Zd Zd Zd Zd Z	y)	r&   zN
    A stream backed corpus view specialized for use with the BNC corpus.
    >   pbgapaligneventpauseshiftvocalunclearc                 P   |rd}nd}|| _         || _        || _        || _        d| _        d| _        d| _        d| _        t        j                  | ||       | j                          | j                  | j                  d| j                         | j                          ddi| _        y)aG  
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.*/sz.*/s/(.*/)?(c|w)Nz.*/teiHeader$r   rD   )_sent_tag_strip_space_stemtitleauthoreditorrespsr   r   _open
read_block_streamhandle_headerclose_tag_context)r   r*   r#   r   r   r   tagspecs          r   r   zBNCWordView.__init__   s     G(G
	'


tVW5 	

ot7I7IJ

 Gr   c                    |j                  d      }|rdj                  d |D              | _        |j                  d      }|rdj                  d |D              | _        |j                  d      }|rdj                  d |D              | _        |j                  d      }|rd	j                  d
 |D              | _        y y )NztitleStmt/title
c              3   P   K   | ]  }|j                   j                            y wr
   r3   r4   ).0r]   s     r   	<genexpr>z,BNCWordView.handle_header.<locals>.<genexpr>   s     "J6%5::#3#3#56   $&ztitleStmt/authorc              3   P   K   | ]  }|j                   j                            y wr
   rk   )rl   r^   s     r   rm   z,BNCWordView.handle_header.<locals>.<genexpr>        #NgFFKK$5$5$7grn   ztitleStmt/editorc              3   P   K   | ]  }|j                   j                            y wr
   rk   )rl   r_   s     r   rm   z,BNCWordView.handle_header.<locals>.<genexpr>   rp   rn   ztitleStmt/respStmtz

c              3   L   K   | ]  }d j                  d |D                yw)ri   c              3   P   K   | ]  }|j                   j                            y wr
   rk   )rl   resp_elts     r   rm   z6BNCWordView.handle_header.<locals>.<genexpr>.<genexpr>   s     EH(----/rn   N)join)rl   resps     r   rm   z,BNCWordView.handle_header.<locals>.<genexpr>   s$      %RW$		EEERWs   "$)r1   ru   r]   r^   r_   r`   )r   rH   contexttitlesauthorseditorsr`   s          r   rd   zBNCWordView.handle_header   s    ./"J6"JJDJ++01))#Ng#NNDK++01))#Ng#NNDK01 %RW% DJ r   c                 ^    | j                   r| j                  |      S | j                  |      S r
   )rY   handle_senthandle_word)r   rH   rw   s      r   
handle_eltzBNCWordView.handle_elt   s+    ::##C((##C((r   c                 ^   |j                   }|sd}| j                  s| j                  r|j                         }| j                  r|j	                  d|      }| j
                  dk(  r||j	                  d      f}|S | j
                  dk(  r#||j	                  d|j	                  d            f}|S )Nr,   r-   r   r   )r3   r[   r\   r4   r5   rZ   )r   rH   r?   s      r   r}   zBNCWordView.handle_word   s    xxD

::<D::774&D99#''$-(D  YY%#''%78Dr   c                 p   g }|D ]  }|j                   dv r#||D cg c]  }| j                  |       c}z  }4|j                   dv r!|j                  | j                  |             c|j                   | j                  vs|t	        d|j                   z         t        |j                  d   |      S c c}w )N)mwhicorrtrunc)rG   rF   zUnexpected element %sr.   )r   r}   r6   tags_to_ignore
ValueErrorr7   r8   )r   rH   r#   rI   rG   s        r   r|   zBNCWordView.handle_sent   s    Eyy99e<e))!,e<<j(D,,U34$"5"55 !8599!DEE  3::c?D11 =s   B3N)
r@   rA   rB   rC   r   r   rd   r~   r}   r|   rD   r   r   r&   r&      s-    	N$@()	2r   r&   r
   )rC   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r2   rM   r7   r&   rD   r   r   <module>r      sB    H * R R|o |~#$ #f2- f2r   