
    Of.                         S r SSKrSSK7  SSK7  \R                  " S5      r\R                  " S5      r\R                  " S5      r\R                  " S5      r " S S	5      r	 " S
 S5      r
 " S S\5      rg)a	  
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).

Customer Review Corpus information
==================================

Annotated by: Minqing Hu and Bing Liu, 2004.
    Department of Computer Science
    University of Illinois at Chicago

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.

Related papers:

- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
    Proceedings of the ACM SIGKDD International Conference on Knowledge
    Discovery & Data Mining (KDD-04), 2004.

- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
    Proceedings of Nineteeth National Conference on Artificial Intelligence
    (AAAI-2004), 2004.

- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
    Opinion Mining." Proceedings of First ACM International Conference on Web
    Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
    Stanford, California, USA.

Symbols used in the annotated reviews:

    :[t]: the title of the review: Each [t] tag starts a review.
    :xxxx[+|-n]: xxxx is a product feature.
    :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
           Note that the strength is quite subjective.
           You may want ignore it, but only considering + and -
    :[-n]: Negative opinion
    :##:   start of each sentence. Each line is a sentence.
    :[u]:  feature not appeared in the sentence.
    :[p]:  feature not appeared in the sentence. Pronoun resolution is needed.
    :[s]:  suggestion or recommendation.
    :[cc]: comparison with a competing product from a different brand.
    :[cs]: comparison with a competing product from the same brand.

Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
    provide separation between different reviews. This is due to the fact that
    the dataset was specifically designed for aspect/feature-based sentiment
    analysis, for which sentence-level annotation is sufficient. For document-
    level classification and analysis, this peculiarity should be taken into
    consideration.
    N)*z^\[t\](.*)$z%((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]z\[(?!t)(p|u|s|cc|cs)\]z##(.*)$c                   :    \ rS rSrSrS
S jrS rS rS rS r	S	r
g)ReviewM   z6
A Review is the main block of a ReviewsCorpusReader.
Nc                 2    Xl         Uc  / U l        gX l        g)zt
:param title: the title of the review.
:param review_lines: the list of the ReviewLines that belong to the Review.
N)titlereview_lines)selfr   r	   s      </usr/lib/python3/dist-packages/nltk/corpus/reader/reviews.py__init__Review.__init__R   s    
 
 "D ,    c                 h    [        U[        5      (       d   eU R                  R                  U5        g)zo
Add a line (ReviewLine) to the review.

:param review_line: a ReviewLine instance that belongs to the Review.
N)
isinstance
ReviewLiner	   appendr
   review_lines     r   add_lineReview.add_line]   s+     +z2222  -r   c                 f    / nU R                    H  nUR                  UR                  5        M      U$ )z
Return a list of features in the review. Each feature is a tuple made of
the specific item feature and the opinion strength about that feature.

:return: all features of the review as a list of tuples (feat, score).
:rtype: list(tuple)
)r	   extendfeatures)r
   r   r   s      r   r   Review.featuresf   s0     ,,KOOK001 -r   c                 X    U R                    Vs/ s H  oR                  PM     sn$ s  snf )z
Return all tokenized sentences in the review.

:return: all sentences of the review as lists of tokens.
:rtype: list(list(str))
)r	   sentr   s     r   sentsReview.sentss   s(     594E4EF4E[  4EFFFs   'c                 N    SR                  U R                  U R                  5      $ )Nz#Review(title="{}", review_lines={}))formatr   r	   r
   s    r   __repr__Review.__repr__|   s$    4;;JJ))
 	
r   )r	   r   NN)__name__
__module____qualname____firstlineno____doc__r   r   r   r   r"   __static_attributes__ r   r   r   r   M   s!    	-.G
r   r   c                   (    \ rS rSrSrSS jrS rSrg)r      z
A ReviewLine represents a sentence of the review, together with (optional)
annotations of its features and notes about the reviewed item.
Nc                 T    Xl         Uc  / U l        OX l        Uc  / U l        g X0l        g Nr   r   notes)r
   r   r   r1   s       r   r   ReviewLine.__init__   s*    	DM$M=DJJr   c                 d    SR                  U R                  U R                  U R                  5      $ )Nz*ReviewLine(features={}, notes={}, sent={}))r    r   r1   r   r!   s    r   r"   ReviewLine.__repr__   s(    ;BBMM4::tyy
 	
r   )r   r1   r   r$   )r%   r&   r'   r(   r)   r   r"   r*   r+   r   r   r   r      s    


r   r   c                   t    \ rS rSrSr\r\" 5       S4S jrSS jr	SS jr
SS jrSS	 jrS
 rS rS rS rSrg)ReviewsCorpusReader   aR  
Reader for the Customer Review Data dataset by Hu, Liu (2004).
Note: we are not applying any sentence tokenization at the moment, just word
tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
utf8c                 J    [         R                  XX$5        X0l        SU l        g)a4  
:param root: The root directory for the corpus.
:param fileids: a list or regexp specifying the fileids in the corpus.
:param word_tokenizer: a tokenizer for breaking sentences or paragraphs
    into words. Default: `WordPunctTokenizer`
:param encoding: the encoding that should be used to read the corpus.
z
README.txtN)CorpusReaderr   _word_tokenizer_readme)r
   rootfileidsword_tokenizerencodings        r   r   ReviewsCorpusReader.__init__   s!     	d'<-#r   Nc                     Uc  U R                   nO[        U[        5      (       a  U/n[        U R	                  US5       VVs/ s H  u  p#U R                  X R                  US9PM!     snn5      $ s  snnf )a=  
Return a list of features. Each feature is a tuple made of the specific
item feature and the opinion strength about that feature.

:param fileids: a list or regexp specifying the ids of the files whose
    features have to be returned.
:return: all features for the item(s) in the given file(s).
:rtype: list(tuple)
Tr@   )_fileidsr   strconcatabspaths
CorpusView_read_featuresr
   r>   fileidencs       r   r   ReviewsCorpusReader.features   su     ?mmG%%iG &*]]7D%A%AMV (;(;cJ%A
 	
s   &A1
c                     Uc  U R                   n[        U R                  US5       VVs/ s H  u  p#U R                  X R                  US9PM!     snn5      $ s  snnf )a#  
Return all the reviews as a list of Review objects. If `fileids` is
specified, return all the reviews from each of the specified files.

:param fileids: a list or regexp specifying the ids of the files whose
    reviews have to be returned.
:return: the given file(s) as a list of reviews.
TrC   )rD   rF   rG   rH   _read_review_blockrJ   s       r   reviewsReviewsCorpusReader.reviews   sa     ?mmG &*]]7D%A%AMV (?(?#N%A
 	
s   &A
c                     [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )a!  
Return all sentences in the corpus or in the specified files.

:param fileids: a list or regexp specifying the ids of the files whose
    sentences have to be returned.
:return: the given file(s) as a list of sentences, each encoded as a
    list of word strings.
:rtype: list(list(str))
TrC   )rF   rG   rH   _read_sent_blockr
   r>   pathrL   rK   s        r   r   ReviewsCorpusReader.sents   X      ,0==$+M+M'T &;&;cJ+M
 	
   'Ac                     [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )a  
Return all words and punctuation symbols in the corpus or in the specified
files.

:param fileids: a list or regexp specifying the ids of the files whose
    words have to be returned.
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
TrC   )rF   rG   rH   _read_word_blockrT   s        r   wordsReviewsCorpusReader.words  rW   rX   c                     / n[        S5       HH  nUR                  5       nU(       d  Us  $ UR                  [        R                  " [
        U5      5        MJ     U$ )N   )rangereadliner   refindallFEATURES)r
   streamr   ilines        r   rI   "ReviewsCorpusReader._read_features  sG    rA??$DOOBJJx67	 
 r   c                     UR                  5       nU(       d  / $ [        R                  " [        U5      nU(       a'  [	        UR                  S5      R                  5       S9nOMd   UR                  5       nUR                  5       nU(       d  U/$ [        R                  " [        U5      (       a  UR                  U5        U/$ [        R                  " [        U5      n[        R                  " [        U5      n[        R                  " [        U5      nU(       a  U R                  R                  US   5      n[        XUS9n	UR!                  U	5        M  )N   )r   r   r0   )r`   ra   matchTITLEr   groupstriptellseekrb   rc   NOTESSENTr;   tokenizer   r   )
r
   rd   rf   title_matchreviewoldposfeatsr1   r   r   s
             r   rO   &ReviewsCorpusReader._read_review_block  s   ??$D	((5$/K%++A.446   [[]F??$Dx xxt$$F#xJJx.EJJud+E::dD)D++44T!W=$$eLKOOK(% r   c                     / nU R                  U5       H2  nUR                  UR                  5        Vs/ s H  oDPM     sn5        M4     U$ s  snf r/   )rO   r   r   )r
   rd   r   rt   r   s        r   rS   $ReviewsCorpusReader._read_sent_block>  sG    --f5FLL6<<>:>4$>:; 6 ;s   A
c                     / n[        S5       Hd  nUR                  5       n[        R                  " [        U5      nU(       d  M7  UR                  U R                  R                  US   5      5        Mf     U$ )Nr^   r   )r_   r`   ra   rb   rq   r   r;   rr   )r
   rd   r[   re   rf   r   s         r   rZ   $ReviewsCorpusReader._read_word_blockD  s^    rA??$D::dD)DtT11::47CD	 
 r   )r<   r;   r/   )r%   r&   r'   r(   r)   StreamBackedCorpusViewrH   WordPunctTokenizerr   r   rP   r   r[   rI   rO   rS   rZ   r*   r+   r   r   r6   r6      sF    > (J -?,@6$
*
$
"
")Br   r6   )r)   ra   nltk.corpus.reader.apinltk.tokenizecompilerk   rc   rp   rq   r   r   r:   r6   r+   r   r   <module>r      sw   6p 
 $ 


>"::, 	

,-	zz*2
 2
j
 
0q, qr   