
    g.                         d Z ddlZddl ddl  ej                  d      Z ej                  d      Z ej                  d      Z ej                  d      Z G d d	      Z	 G d
 d      Z
 G d de      Zy)a	  
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).

Customer Review Corpus information
==================================

Annotated by: Minqing Hu and Bing Liu, 2004.
    Department of Computer Science
    University of Illinois at Chicago

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.

Related papers:

- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
    Proceedings of the ACM SIGKDD International Conference on Knowledge
    Discovery & Data Mining (KDD-04), 2004.

- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
    Proceedings of Nineteeth National Conference on Artificial Intelligence
    (AAAI-2004), 2004.

- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
    Opinion Mining." Proceedings of First ACM International Conference on Web
    Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
    Stanford, California, USA.

Symbols used in the annotated reviews:

    :[t]: the title of the review: Each [t] tag starts a review.
    :xxxx[+|-n]: xxxx is a product feature.
    :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
           Note that the strength is quite subjective.
           You may want ignore it, but only considering + and -
    :[-n]: Negative opinion
    :##:   start of each sentence. Each line is a sentence.
    :[u]:  feature not appeared in the sentence.
    :[p]:  feature not appeared in the sentence. Pronoun resolution is needed.
    :[s]:  suggestion or recommendation.
    :[cc]: comparison with a competing product from a different brand.
    :[cs]: comparison with a competing product from the same brand.

Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
    provide separation between different reviews. This is due to the fact that
    the dataset was specifically designed for aspect/feature-based sentiment
    analysis, for which sentence-level annotation is sufficient. For document-
    level classification and analysis, this peculiarity should be taken into
    consideration.
    N)*z^\[t\](.*)$z%((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]z\[(?!t)(p|u|s|cc|cs)\]z##(.*)$c                   0    e Zd ZdZddZd Zd Zd Zd Zy)	Reviewz>
    A Review is the main block of a ReviewsCorpusReader.
    Nc                 4    || _         |g | _        y|| _        y)z
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        N)titlereview_lines)selfr   r   s      O/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/reviews.py__init__zReview.__init__R   s!    
 
 "D ,D    c                 ^    t        |t              sJ | j                  j                  |       y)z
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        N)
isinstance
ReviewLiner   appendr	   review_lines     r
   add_linezReview.add_line]   s(     +z222  -r   c                 b    g }| j                   D ]  }|j                  |j                          |S )a  
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        )r   extendfeatures)r	   r   r   s      r
   r   zReview.featuresf   s0     ,,KOOK001 -r   c                 T    | j                   D cg c]  }|j                   c}S c c}w )z
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        )r   sentr   s     r
   sentszReview.sentss   s*     594E4EF4E[  4EFFFs   %c                 N    dj                  | j                  | j                        S )Nz#Review(title="{}", review_lines={}))formatr   r   r	   s    r
   __repr__zReview.__repr__|   s$    4;;JJ))
 	
r   NN)	__name__
__module____qualname____doc__r   r   r   r   r    r   r
   r   r   M   s!    	-.G
r   r   c                       e Zd ZdZddZd Zy)r   z
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    Nc                 V    || _         |g | _        n|| _        |g | _        y || _        y Nr   r   notes)r	   r   r   r(   s       r
   r   zReviewLine.__init__   s0    	DM$DM=DJDJr   c                 d    dj                  | j                  | j                  | j                        S )Nz*ReviewLine(features={}, notes={}, sent={}))r   r   r(   r   r   s    r
   r   zReviewLine.__repr__   s(    ;BBMM4::tyy
 	
r   r   )r   r    r!   r"   r   r   r#   r   r
   r   r      s    


r   r   c                   b    e Zd ZdZeZ e       dfdZddZddZ	ddZ
ddZd	 Zd
 Zd Zd Zy)ReviewsCorpusReadera  
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

        >>> from nltk.corpus import product_reviews_1
        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
        >>> review = camera_reviews[0]
        >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
        >>> review.features() # doctest: +NORMALIZE_WHITESPACE
        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
        ('option', '+1')]

    We can also reach the same information directly from the stream:

        >>> product_reviews_1.features('Canon_G3.txt')
        [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> mean = tot / n_reviews
        >>> print(n_reviews, tot, mean)
        15 24 1.6
    utf8c                 P    t         j                  | |||       || _        d| _        y)ad  
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        z
README.txtN)CorpusReaderr   _word_tokenizer_readme)r	   rootfileidsword_tokenizerencodings        r
   r   zReviewsCorpusReader.__init__   s'     	dD'8<-#r   Nc                     || j                   }nt        |t              r|g}t        | j	                  |d      D cg c]#  \  }}| j                  || j                  |      % c}}      S c c}}w )au  
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        Tr4   )_fileidsr   strconcatabspaths
CorpusView_read_featuresr	   r2   fileidencs       r
   r   zReviewsCorpusReader.features   sv     ?mmG%iG &*]]7D%A%AMVS (;(;cJ%A
 	
s   (A-
c                     || j                   }t        | j                  |d      D cg c]#  \  }}| j                  || j                  |      % c}}      S c c}}w )aS  
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        Tr6   )r7   r9   r:   r;   _read_review_blockr=   s       r
   reviewszReviewsCorpusReader.reviews   se     ?mmG &*]]7D%A%AMVS (?(?#N%A
 	
s   (A
c                     t        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )aY  
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        Tr6   )r9   r:   r;   _read_sent_blockr	   r2   pathr?   r>   s        r
   r   zReviewsCorpusReader.sents   \      ,0==$+M+M'T3 d&;&;cJ+M
 	
   )Ac                     t        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )aK  
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        Tr6   )r9   r:   r;   _read_word_blockrE   s        r
   wordszReviewsCorpusReader.words  rG   rH   c                     g }t        d      D ]A  }|j                         }|s|c S |j                  t        j                  t
        |             C |S )N   )rangereadliner   refindallFEATURES)r	   streamr   ilines        r
   r<   z"ReviewsCorpusReader._read_features  sG    rA??$DOOBJJx67	 
 r   c                 ~   	 |j                         }|sg S t        j                  t        |      }|r*t	        |j                  d      j                               }n[	 |j                         }|j                         }|s|gS t        j                  t        |      r|j                  |       |gS t        j                  t        |      }t        j                  t        |      }t        j                  t        |      }|r| j                  j                  |d         }t        |||      }	|j!                  |	       )N   )r   r   r'   )rO   rP   matchTITLEr   groupstriptellseekrQ   rR   NOTESSENTr/   tokenizer   r   )
r	   rS   rU   title_matchreviewoldposfeatsr(   r   r   s
             r
   rA   z&ReviewsCorpusReader._read_review_block  s   ??$D	((5$/K%++A.446   [[]F??$Dx xxt$F#xJJx.EJJud+E::dD)D++44T!W=$$eLKOOK(% r   c                     g }| j                  |      D ]/  }|j                  |j                         D cg c]  }| c}       1 |S c c}w r&   )rA   r   r   )r	   rS   r   rb   r   s        r
   rD   z$ReviewsCorpusReader._read_sent_block>  sG    --f5FLL6<<>:>4$>:; 6 ;s   	A
c                     g }t        d      D ]\  }|j                         }t        j                  t        |      }|s0|j                  | j                  j                  |d                ^ |S )NrM   r   )rN   rO   rP   rQ   r_   r   r/   r`   )r	   rS   rK   rT   rU   r   s         r
   rJ   z$ReviewsCorpusReader._read_word_blockD  s\    rA??$D::dD)DT11::47CD	 
 r   r&   )r   r    r!   r"   StreamBackedCorpusViewr;   WordPunctTokenizerr   r   rB   r   rK   r<   rA   rD   rJ   r#   r   r
   r+   r+      sF    > (J -?,@6$
*
$
"
")Br   r+   )r"   rP   nltk.corpus.reader.apinltk.tokenizecompilerY   rR   r^   r_   r   r   r.   r+   r#   r   r
   <module>rl      s   6p 
 $ 

>"2::, 	

,-rzz*2
 2
j
 
0q, qr   