ó
    ´OÂf›  ã                   ó<   • S r SSKrSSK7  SSK7   " S S\\5      rg)aˆ  
CorpusReader for the Pros and Cons dataset.

- Pros and Cons dataset information -

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

Related papers:

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.

- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
    Opinions on the Web". Proceedings of the 14th international World Wide Web
    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
é    N)Ú*c                   óT   • \ rS rSrSr\r\" 5       S4S jrSS jr	SS jr
S rS	 rS
rg)ÚProsConsCorpusReaderé"   a®  
Reader for the Pros and Cons sentence dataset.

    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
Úutf8c                 óf   • [         R                  XX$5        [        R                  X5        X0l        g)a}  
:param root: The root directory for the corpus.
:param fileids: a list or regexp specifying the fileids in the corpus.
:param word_tokenizer: a tokenizer for breaking sentences or paragraphs
    into words. Default: `WhitespaceTokenizer`
:param encoding: the encoding that should be used to read the corpus.
:param kwargs: additional parameters passed to CategorizedCorpusReader.
N)ÚCorpusReaderÚ__init__ÚCategorizedCorpusReaderÚ_word_tokenizer)ÚselfÚrootÚfileidsÚword_tokenizerÚencodingÚkwargss         Ú>/usr/lib/python3/dist-packages/nltk/corpus/reader/pros_cons.pyr
   ÚProsConsCorpusReader.__init__1   s(   € ô" 	×Ñ˜d¨'Ô<Ü×(Ñ(¨Ô6Ø-Õó    Nc                 ó  • U R                  X5      nUc  U R                  nO[        U[        5      (       a  U/n[	        U R                  USS5       VVVs/ s H   u  p4nU R                  X0R                  US9PM"     snnn5      $ s  snnnf )a   
Return all sentences in the corpus or in the specified files/categories.

:param fileids: a list or regexp specifying the ids of the files whose
    sentences have to be returned.
:param categories: a list specifying the categories whose sentences
    have to be returned.
:return: the given file(s) as a list of sentences. Each sentence is
    tokenized using the specified word_tokenizer.
:rtype: list(list(str))
T©r   )Ú_resolveÚ_fileidsÚ
isinstanceÚstrÚconcatÚabspathsÚ
CorpusViewÚ_read_sent_block©r   r   Ú
categoriesÚpathÚencÚfileids         r   ÚsentsÚProsConsCorpusReader.sentsF   óˆ   € ð —-‘- Ó4ˆØ‰?Ø—m‘m‰GÜ˜¤×%Ñ%ØiˆGÜð ,0¯=©=¸À$ÈÔ+Mõâ+MÑ'T ð —‘ ×&;Ñ&;ÀcÓJÙ+Móó
ð 	
ùôó   Á'Bc                 ó  • U R                  X5      nUc  U R                  nO[        U[        5      (       a  U/n[	        U R                  USS5       VVVs/ s H   u  p4nU R                  X0R                  US9PM"     snnn5      $ s  snnnf )aw  
Return all words and punctuation symbols in the corpus or in the specified
files/categories.

:param fileids: a list or regexp specifying the ids of the files whose
    words have to be returned.
:param categories: a list specifying the categories whose words have
    to be returned.
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
Tr   )r   r   r   r   r   r   r   Ú_read_word_blockr    s         r   ÚwordsÚProsConsCorpusReader.words^   r'   r(   c                 ó.  • / n[        S5       Hƒ  nUR                  5       nU(       d  M  [        R                  " SU5      nU(       d  M<  UR	                  U R
                  R                  UR                  S5      R                  5       5      5        M…     U$ )Né   z+^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>é   )	ÚrangeÚreadlineÚreÚmatchÚappendr   ÚtokenizeÚgroupÚstrip)r   Ústreamr%   ÚiÚlineÚsents         r   r   Ú%ProsConsCorpusReader._read_sent_blockv   sr   € ØˆÜr–ˆAØ—?‘?Ó$ˆDÞÙÜ—8’8ÐJÈDÓQˆDßˆtØ—‘˜T×1Ñ1×:Ñ:¸4¿:¹:Àa»=×;NÑ;NÓ;PÓQÖRñ ð ˆr   c                 ó\   • / nU R                  U5       H  nUR                  U5        M     U$ )N)r   Úextend)r   r8   r+   r;   s       r   r*   Ú%ProsConsCorpusReader._read_word_block   s-   € ØˆØ×)Ñ)¨&Ö1ˆDØL‰L˜Öñ 2àˆr   )r   )NN)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__ÚStreamBackedCorpusViewr   ÚWordPunctTokenizerr
   r%   r+   r   r*   Ú__static_attributes__© r   r   r   r   "   s2   † ñ
ð (€Jñ *Ó+Øô.ô*
ô0
ò0	õr   r   )rD   r2   Únltk.corpus.reader.apiÚnltk.tokenizer   r	   r   rH   r   r   Ú<module>rK      s&   ðñó( 
ä $Ü ôcÐ2°Lõ cr   