
    gx                     Z    d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
  G d de      Zy)z{
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
    N)CorpusReader)StreamBackedCorpusViewZipFilePathPointerconcat)TweetTokenizerc                   L    e Zd ZdZeZ	 d e       dfdZd	dZd	dZ	d	dZ
d Zy)
TwitterCorpusReadera7  
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))

    Nutf8c                    t        j                  | |||       | j                  | j                        D ]D  }t	        |t
              rt        j                  j                  |      dk(  s7t        d| d       	 || _
        y)a  
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        r   zFile z	 is emptyN)r   __init__abspaths_fileids
isinstancer   ospathgetsize
ValueError_word_tokenizer)selfrootfileidsword_tokenizerencodingr   s         O/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/twitter.pyr   zTwitterCorpusReader.__init__:   st     	dD'8<MM$--0D$ 23&!+ 5i!899	 1
 	F-    c                     t        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )a(  
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        T)r   )r   r   
CorpusView_read_tweets)r   r   r   encfileids        r   docszTwitterCorpusReader.docsN   s\      ,0==$+M+M'T3 d&7&7#F+M
 	
s   )Ac                     | j                  |      }g }|D ]D  }	 |d   }t        |t              r|j                  | j                        }|j                  |       F |S # t        $ r Y Tw xY w)z
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        text)r!   r   bytesdecoder   appendKeyError)r   r   
fulltweetstweetsjsonor#   s         r   stringszTwitterCorpusReader.strings_   sr     YYw'
EV}dE*;;t}}5Dd#     s   AA	A+*A+c                     | j                  |      }| j                  }|D cg c]  }|j                  |       c}S c c}w )z
        :return: the given file(s) as a list of the text content of Tweets as
            as a list of words, screenanames, hashtags, URLs and punctuation symbols.

        :rtype: list(list(str))
        )r+   r   tokenize)r   r   r)   	tokenizerts        r   	tokenizedzTwitterCorpusReader.tokenizedr   s@     g&((	/56v!	""1%v666s   =c                     g }t        d      D ]>  }|j                         }|s|c S t        j                  |      }|j	                  |       @ |S )zS
        Assumes that each line in ``stream`` is a JSON-serialised object.
        
   )rangereadlinejsonloadsr&   )r   streamr)   ilinetweets         r   r   z TwitterCorpusReader._read_tweets}   sL     rA??$DJJt$EMM%   r   )N)__name__
__module____qualname____doc__r   r   r   r   r!   r+   r0   r    r   r   r	   r	      s8    > (J
 !1AF.(
"&	7r   r	   )r>   r5   r   nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   nltk.tokenizer   r	   r?   r   r   <module>rC      s,   
  	 / V V (s, sr   