
    g-                        d dl mZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZ d Zd Z G d	 d
      Z G d de      Z G d de      Z edd      Z edd      Z edd      Z G d de      Z G d dee      Zy)    )
namedtuple)partialwraps)CategorizedCorpusReader)PlaintextCorpusReader)concatread_blankline_block)blankline_tokenizesent_tokenizeword_tokenizec                 .     t                fd       }|S )z
    A decorator that allows a function to be called with
    a single string of comma-separated values which become
    individual function arguments.
    c                     t               }| D ]  }t        |t              r=|j                  |j	                  d      D ch c]  }|j                          c}       Pt        |t               r|j                  t        |             {|j                  |        |j                         D ]F  \  }}t        |t              s|j	                  d      D ch c]  }|j                          c}||<   H  |i |S c c}w c c}w )N,)list
isinstancestrappendsplitstripsetitems)argskwargs_argsargpartnamevaluefuncs          P/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/markdown.pywrapperz,comma_separated_string_args.<locals>.wrapper   s    C#s#syy~F~tdjjl~FGC&SX&S!  "<<>KD%%%9>S9IJ9I

9IJt * U%f%% G  Ks   C=
D)r   )r   r!   s   ` r    comma_separated_string_argsr"   
   s"     4[& & N    c                 J    t        |       }|r|j                  |d         gS |S Nr   )r	   render)streamparserblocks      r    read_parse_blankline_blockr*   #   s*     (EeAh'((Lr#   c                   ^    e Zd Zd Zd Zd Zed        Zed        Zed        Z	ed        Z
y)	MarkdownBlockc                      || _         d| _        y )N   )contenttruncate_at)selfr/   s     r    __init__zMarkdownBlock.__init__+   s    r#   c                 ^    | j                   j                   dt        t        |              dS )Nz	(content=))	__class____name__reprr   r1   s    r    __repr__zMarkdownBlock.__repr__/   s)    ..))*)DTO3DAFFr#   c                     | j                   d | j                    t        | j                         | j                  kD  rd S d S )Nz... )r/   r0   lenr8   s    r    __str__zMarkdownBlock.__str__2   sR    ||-T--./DLL)D,<,<<uEG	
BDEG	
r#   c                     | j                   S N)r/   r8   s    r    rawzMarkdownBlock.raw8   s    ||r#   c                 ,    t        | j                        S r?   )r   r/   r8   s    r    wordszMarkdownBlock.words<   s    T\\**r#   c                 d    t        | j                        D cg c]  }t        |       c}S c c}w r?   )r   r/   r   )r1   sents     r    sentszMarkdownBlock.sents@   s*    0=dll0KL0Kd#0KLLLs   -c           
          t        | j                        D cg c]$  }t        |      D cg c]  }t        |       c}& c}}S c c}w c c}}w r?   )r
   r/   r   r   )r1   pararD   s      r    paraszMarkdownBlock.parasD   sP     +4<<8
8 .;4-@A-@T]4 -@A8
 	
A
s   AAAAN)r6   
__module____qualname__r2   r9   r=   propertyr@   rB   rE   rH    r#   r    r,   r,   *   sd    G
   + + M M 
 
r#   r,   c                   N     e Zd Z fdZed        Zed        Zed        Z xZS )	CodeBlockc                 ,    || _         t        |   |  y r?   )languagesuperr2   )r1   rP   r   r5   s      r    r2   zCodeBlock.__init__M   s     $r#   c                 n    | j                   j                         D cg c]  }t        |       c}S c c}w r?   )r/   
splitlinesr   )r1   lines     r    rE   zCodeBlock.sentsQ   s.    040G0G0IJ0Id#0IJJJs   2c                 6    | j                   j                         S r?   )r/   rS   r8   s    r    lineszCodeBlock.linesU   s    ||&&((r#   c           
          t        | j                        D cg c])  }|j                         D cg c]  }t        |       c}+ c}}S c c}w c c}}w r?   )r
   r/   rS   r   )r1   rG   rT   s      r    rH   zCodeBlock.parasY   sR     +4<<8
8 .2__->?->T]4 ->?8
 	
?
s   AAAA)	r6   rI   rJ   r2   rK   rE   rV   rH   __classcell__r5   s   @r    rN   rN   L   sF      K K ) ) 
 
r#   rN   c                        e Zd Z fdZ xZS )MarkdownSectionc                 :    || _         || _        t        |   |  y r?   )headinglevelrQ   r2   )r1   r]   r^   r   r5   s       r    r2   zMarkdownSection.__init__b   s    
$r#   )r6   rI   rJ   r2   rX   rY   s   @r    r[   r[   a   s       r#   r[   Imagezlabel, src, titleLinkzlabel, href, titleListzis_ordered, itemsc                   *     e Zd Zdd fd
Zd Z xZS )MarkdownCorpusReaderNr(   c                   ddl m} ddlm} ddlm} || _        | j                  * |d|      | _        | j                  j                  |       |j                  dt        t        | j                               t        | 0  |i | y )	Nr   )
MarkdownIt)RendererPlain)front_matter_plugin
commonmark)renderer_clspara_block_readerrd   )markdown_itrf   mdit_plain.rendererrg   mdit_py_plugins.front_matterrh   r(   use
setdefaultr   r*   rQ   r2   )r1   r(   r   r   rf   rg   rh   r5   s          r    r2   zMarkdownCorpusReader.__init__n   so    *5D;;$\NDKKKOO/0)CDKK!X	
 	$)&)r#   c                     t               }| j                  |      D ],  }|j                  | j                  j	                  |             . |S r?   )r   _para_block_readerextend_word_tokenizertokenize)r1   r'   rB   rG   s       r    _read_word_blockz%MarkdownCorpusReader._read_word_block~   s@    ++F3DLL--66t<= 4r#   )r6   rI   rJ   r2   rv   rX   rY   s   @r    rc   rc   m   s    %) * r#   rc   c                   V    e Zd ZdZdddZed fd	       Zed fd	       Zed fd	       Zed fd	       Z	ed fd		       Z
ed fd
	       Zd Zd Zedd       Zd Zedd       Zd Zedd       Zd Zedd       Zd Zedd       Zd Zedd       Zd Zedd       Z xZS )CategorizedMarkdownCorpusReadera  
    A reader for markdown corpora whose documents are divided into
    categories based on their file identifiers.

    Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader:
    https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader
    tags)	cat_fieldc                p   g d}t        fd|D              st               d<   t        j                  |        t	        j                  | g|i  | j
                  U| j
                  sH| j                  D ]8  }| j                  |      }|s|d   j                  |g       | j
                  |<   : yyy)a  
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``MarkdownCorpusReader`` constructor.
        )cat_patterncat_mapcat_filec              3   &   K   | ]  }|v  
 y wr?   rL   ).0r   r   s     r    	<genexpr>z;CategorizedMarkdownCorpusReader.__init__.<locals>.<genexpr>   s     5HS3&=Hs   r}   Nr   )	anydictr   r2   rc   _map_fileidsmetadataget)r1   rz   r   r   cat_argsfile_idr   s      `   r    r2   z(CategorizedMarkdownCorpusReader.__init__   s     :5H55 !%F9((v6%%d<T<V< 99 ====1)1!B)GDIIg& ) *3 r#   c                 "    t         |   |      S r?   )rQ   
categories)r1   fileidsr5   s     r    r   z*CategorizedMarkdownCorpusReader.categories   s    w!'**r#   c                 >    || j                   S t        | 	  |      S r?   )r   rQ   r   )r1   r   r5   s     r    r   z'CategorizedMarkdownCorpusReader.fileids   s"    == wz**r#   c                 B    t         |   | j                  ||            S r?   )rQ   r@   _resolver1   r   r   r5   s      r    r@   z#CategorizedMarkdownCorpusReader.raw   s    w{4==*=>>r#   c                 B    t         |   | j                  ||            S r?   )rQ   rB   r   r   s      r    rB   z%CategorizedMarkdownCorpusReader.words       w}T]]7J?@@r#   c                 B    t         |   | j                  ||            S r?   )rQ   rE   r   r   s      r    rE   z%CategorizedMarkdownCorpusReader.sents   r   r#   c                 B    t         |   | j                  ||            S r?   )rQ   rH   r   r   s      r    rH   z%CategorizedMarkdownCorpusReader.paras   r   r#   c                     t        | j                  | j                  ||      d      D cg c]  \  }}| j                  |||       c}}      S c c}}w )NT)include_encoding)encoding)r   abspathsr   
CorpusView)r1   readerr   r   pathencs         r    concatenated_viewz1CategorizedMarkdownCorpusReader.concatenated_view   sf     $(==MM':6 $1 $$KT3 fs;$
 	
s   A
c                     ddl m} | j                  j                  |j	                               D cg c]#  }|j
                  dk(  r ||j                        % c}S c c}w )Nr   )	safe_loadfront_matter)yamlr   r(   parsereadtyper/   )r1   r'   r   ts       r    metadata_readerz/CategorizedMarkdownCorpusReader.metadata_reader   sV    " [[&&v{{}5
5vv' aii 5
 	
 
s   (Ac                 <    | j                  | j                  ||      S r?   )r   r   r1   r   r   s      r    r   z(CategorizedMarkdownCorpusReader.metadata   s    %%d&:&:GZPPr#   c           
         | j                   j                  |j                               }t        d |      }t        d |      }t	               }t        ||      D ]?  \  }}|j                  |      }|j                  ||      }	|j                  |||	dz           A |D 
cg c]G  }
t        | j                   j                  j                  |
| j                   j                  d             I c}
S c c}
w )Nc                 B    | j                   dk(  xr | j                  dk(  S )Nr   blockquote_openr^   r   r   s    r    <lambda>zCCategorizedMarkdownCorpusReader.blockquote_reader.<locals>.<lambda>   s    agglBqvv1B'BBr#   c                 B    | j                   dk(  xr | j                  dk(  S )Nr   blockquote_closer   r   s    r    r   zCCategorizedMarkdownCorpusReader.blockquote_reader.<locals>.<lambda>   s    agglCqvv1C'CCr#      env)r(   r   r   filterr   zipindexr   r,   rendererr&   options)r1   r'   tokensopening_tokensclosing_tokensblockquotesocopening_indexclosing_indexr)   s              r    blockquote_readerz1CategorizedMarkdownCorpusReader.blockquote_reader   s    ""6;;=1BF
  CV
 f7DAq"LLOM"LLM:Mvmma6GHI 8 %	
 % $$++E4;;3F3FD+Q %	
 	
 
s    AC/c                 <    | j                  | j                  ||      S r?   )r   r   r   s      r    r   z+CategorizedMarkdownCorpusReader.blockquotes       %%d&<&<gzRRr#   c                     | j                   j                  |j                               D cg c]?  }|j                  dk(  r.|j                  dv r t        |j                  |j                        A c}S c c}w )Nr   )fence
code_block)r(   r   r   r^   r   rN   infor/   )r1   r'   r   s      r    code_block_readerz1CategorizedMarkdownCorpusReader.code_block_reader   sl     [[&&v{{}5

 6ww!|*A A 		 6
 	
 
s   AA3c                 <    | j                  | j                  ||      S r?   )r   r   r   s      r    code_blocksz+CategorizedMarkdownCorpusReader.code_blocks   r   r#   c                 <   t        d | j                  j                  |j                                     D cg c]W  }|j                  D ]F  }|j
                  dk(  r5t        |j                  |j                  d      |j                  d            H Y c}}S c c}}w )Nc                      | j                   dk(  S Ninliner   r   s    r    r   z>CategorizedMarkdownCorpusReader.image_reader.<locals>.<lambda>	      !&&H,r#   imagesrctitle)	r   r(   r   r   childrenr   r_   r/   attrGet)r1   r'   inline_tokenchild_tokens       r    image_readerz,CategorizedMarkdownCorpusReader.image_reader  s     !',dkk.?.?.N!
!  ,447* ####E*##G,  5
!
 	
 
s   ABc                 <    | j                  | j                  ||      S r?   )r   r   r   s      r    imagesz&CategorizedMarkdownCorpusReader.images  s    %%d&7&7*MMr#   c                 z   t        d | j                  j                  |j                                     D cg c]s  }t	        |j
                        D ]Y  \  }}|j                  dk(  rEt        |j
                  |dz      j                  |j                  d      |j                  d            [ u c}}}S c c}}}w )Nc                      | j                   dk(  S r   r   r   s    r    r   z=CategorizedMarkdownCorpusReader.link_reader.<locals>.<lambda>  r   r#   	link_openr   hrefr   )
r   r(   r   r   	enumerater   r   r`   r/   r   )r1   r'   r   ir   s        r    link_readerz+CategorizedMarkdownCorpusReader.link_reader  s     !',dkk.?.?.N!
! #,L,A,A"B;;. %%a!e,44##F+##G, #C
!
 	
 
s   A8B6c                 <    | j                  | j                  ||      S r?   )r   r   r   s      r    linksz%CategorizedMarkdownCorpusReader.links!      %%d&6&6LLr#   c                    | j                   j                  |j                               }dt        fd|      }dt        fd|      }t	               }t        ||      D ]?  \  }}|j                  |      }|j                  ||      }	|j                  |||	dz           A |D 
cg c]C  }t        |d   j                  dk(  |D 
cg c]  }
|
j                  s|
j                   c}
      E c}
}S c c}
w c c}
}w )N)bullet_list_openordered_list_openc                 B    | j                   dk(  xr | j                  v S r%   r   )r   opening_typess    r    r   z=CategorizedMarkdownCorpusReader.list_reader.<locals>.<lambda>)      aggl>qvv'>>r#   )bullet_list_closeordered_list_closec                 B    | j                   dk(  xr | j                  v S r%   r   )r   closing_typess    r    r   z=CategorizedMarkdownCorpusReader.list_reader.<locals>.<lambda>-  r   r#   r   r   r   )r(   r   r   r   r   r   r   r   ra   r   r/   )r1   r'   r   r   r   list_blocksr   r   r   r   r   r   r   s              @@r    list_readerz+CategorizedMarkdownCorpusReader.list_reader%  s    ""6;;=1A>
 D>
 f7DAq"LLOM"LLM:Mvmma6GHI 8 &

 &	 q	"55$*8FqaiiF8 &
 	
 9
s   +C<
C7C7*	C<7C<c                 <    | j                  | j                  ||      S r?   )r   r   r   s      r    listsz%CategorizedMarkdownCorpusReader.lists<  r   r#   c                 h   t               t               }}| j                  j                  |j                               D ]]  }|j                  dk(  r8|j
                  dk(  r)|s|j                  |       5|j                  |       |g}J|sM|j                  |       _ |r|j                  |       |D cg c]r  }t        |d   j                  |d   j                  j                  d      | j                  j                  j                  || j                  j                  d             t c}S c c}w )Nr   heading_openr   #r   )r   r(   r   r   r^   r   r   r[   r/   markupcountr   r&   r   )r1   r'   section_blocksr)   r   s        r    section_readerz.CategorizedMarkdownCorpusReader.section_reader@  s    $""6;;=1Aww!|. 8LLO"))%0CEQ 2 !!%( (
 ( a  a%%c*$$++E4;;3F3FD+Q
 (
 	
 
s   5A7D/c                 <    | j                  | j                  ||      S r?   )r   r   r   s      r    sectionsz(CategorizedMarkdownCorpusReader.sectionsV  s    %%d&9&97JOOr#   r?   )NN)r6   rI   rJ   __doc__r2   r"   r   r   r@   rB   rE   rH   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rX   rY   s   @r    rx   rx      sS    )/ H. !+ !+ !+ !+ !? !? !A !A !A !A !A !A


 !Q !Q
( !S !S
 !S !S
 !N !N
 !M !M
. !M !M
, !P !Pr#   rx   N)collectionsr   	functoolsr   r   nltk.corpus.reader.apir   nltk.corpus.reader.plaintextr   nltk.corpus.reader.utilr   r	   nltk.tokenizer
   r   r   r"   r*   r,   rN   r[   r_   r`   ra   rc   rx   rL   r#   r    <module>r     s    " $ : > @ J J2
 
D
 
* m   	7/0&./&-.0 0SP&=?S SPr#   