
    Of-                     J   S r SSKrSSK7  SSK7  \R                  " S5      r\R                  " S5      r\R                  " S5      r\R                  " S5      r\R                  " S5      r	\R                  " S	5      r
\R                  " S
5      r " S S5      r " S S\5      rg)a  
CorpusReader for the Comparative Sentence Dataset.

- Comparative Sentence Dataset information -

Annotated by: Nitin Jindal and Bing Liu, 2006.
              Department of Computer Sicence
              University of Illinois at Chicago

Contact: Nitin Jindal, njindal@cs.uic.edu
         Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)

Distributed with permission.

Related papers:

- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
   Proceedings of the ACM SIGIR International Conference on Information Retrieval
   (SIGIR-06), 2006.

- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
   Proceedings of Twenty First National Conference on Artificial Intelligence
   (AAAI-2006), 2006.

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.
    N)*z^\*+$z<cs-[1234]>z</cs-[1234]>z
<cs-[123]>z<cs-4>z(\d)_((?:[\.\w\s/-](?!\d_))+)z\(([^\(]*)\)$c                   4    \ rS rSrSr      SS jrS rSrg)
Comparison3   zF
A Comparison represents a comparative sentence and its constituents.
Nc                 L    Xl         X l        X0l        X@l        XPl        X`l        g)a  
:param text: a string (optionally tokenized) containing a comparison.
:param comp_type: an integer defining the type of comparison expressed.
    Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
    4 (Non-gradable).
:param entity_1: the first entity considered in the comparison relation.
:param entity_2: the second entity considered in the comparison relation.
:param feature: the feature considered in the comparison relation.
:param keyword: the word or phrase which is used for that comparative relation.
N)text	comp_typeentity_1entity_2featurekeyword)selfr   r	   r
   r   r   r   s          F/usr/lib/python3/dist-packages/nltk/corpus/reader/comparative_sents.py__init__Comparison.__init__8   s"    & 	"      c                     SR                  U R                  U R                  U R                  U R                  U R
                  U R                  5      $ )Nz]Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", feature="{}", keyword="{}"))formatr   r	   r
   r   r   r   )r   s    r   __repr__Comparison.__repr__R   s@    *
&IINNMMMMLLLL

	
r   )r	   r
   r   r   r   r   )NNNNNN)__name__
__module____qualname____firstlineno____doc__r   r   __static_attributes__ r   r   r   r   3   s%     4
r   r   c                   |    \ rS rSrSr\r\" 5       SS4S jrSS jr	SS jr
S rSS	 jrSS
 jrS rS rS rS rSrg) ComparativeSentencesCorpusReader`   aO  
Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
Nutf8c                 V    [         R                  XX%5        X0l        X@l        SU l        g)a~  
:param root: The root directory for this corpus.
:param fileids: a list or regexp specifying the fileids in this corpus.
:param word_tokenizer: tokenizer for breaking sentences or paragraphs
    into words. Default: `WhitespaceTokenizer`
:param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
:param encoding: the encoding that should be used to read the corpus.
z
README.txtN)CorpusReaderr   _word_tokenizer_sent_tokenizer_readme)r   rootfileidsword_tokenizersent_tokenizerencodings         r   r   )ComparativeSentencesCorpusReader.__init__t   s'    " 	d'<--#r   c                     Uc  U R                   nO[        U[        5      (       a  U/n[        U R	                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )z
Return all comparisons in the corpus.

:param fileids: a list or regexp specifying the ids of the files whose
    comparisons have to be returned.
:return: the given file(s) as a list of Comparison objects.
:rtype: list(Comparison)
Tr+   )_fileids
isinstancestrconcatabspaths
CorpusView_read_comparison_blockr   r(   pathencfileids        r   comparisons,ComparativeSentencesCorpusReader.comparisons   sz     ?mmG%%iG ,0==$+M+M'T &A&ACP+M
 	
s   'A5c                    [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      nU Vs1 s H  of(       d  M  UR	                  5       iM     nnU$ s  snnnf s  snf )z
Return a set of all keywords used in the corpus.

:param fileids: a list or regexp specifying the ids of the files whose
    keywords have to be returned.
:return: the set of keywords and comparative phrases used in the corpus.
:rtype: set(str)
Tr.   )r2   r3   r4   _read_keyword_blocklower)r   r(   r7   r8   r9   all_keywordsr   keywords_sets           r   keywords)ComparativeSentencesCorpusReader.keywords   s      ,0==$+M+M'T &>&>M+M
 8DO|Gw|O Ps   'A7
A>A>c                 ,   / nU R                  S5       nUR                  5       nSSS5        WR                  S5       HA  nU(       a  UR                  S5      (       a  M"  UR	                  UR                  5       5        MC     U$ ! , (       d  f       Nf= f)zj
Return the list of words and constituents considered as clues of a
comparison (from listOfkeywords.txt).
zlistOfkeywords.txtN
z//)openreadsplit
startswithappendstrip)r   rA   fpraw_textlines        r   keywords_readme0ComparativeSentencesCorpusReader.keywords_readme   sq    
 YY+,wwyH -NN4(D4??400OODJJL) )  -,s   B
Bc                     [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )a+  
Return all sentences in the corpus.

:param fileids: a list or regexp specifying the ids of the files whose
    sentences have to be returned.
:return: all sentences of the corpus as lists of tokens (or as plain
    strings, if no word tokenizer is specified).
:rtype: list(list(str)) or list(str)
Tr.   )r2   r3   r4   _read_sent_blockr6   s        r   sents&ComparativeSentencesCorpusReader.sents   sX      ,0==$+M+M'T &;&;cJ+M
 	
   'Ac                     [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )z
Return all words and punctuation symbols in the corpus.

:param fileids: a list or regexp specifying the ids of the files whose
    words have to be returned.
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
Tr.   )r2   r3   r4   _read_word_blockr6   s        r   words&ComparativeSentencesCorpusReader.words   sX      ,0==$+M+M'T &;&;cJ+M
 	
rT   c                 |    UR                  5       nU(       d  / $ [        R                  " [        U5      nU(       Ga  [        R                  " [        U5      n[        R                  " [
        U5      nUR                  5       R                  5       nU R                  (       a  U R                  R                  U5      nUR                  5         / nU(       Ga  U GH  n[        [        R                  " SU5      R                  S5      5      n	[        XiS9n
UR                  5       n[        R                  U5      nU(       ab  U H\  u  pUS:X  a  UR                  5       U
l        M"  US:X  a  UR                  5       U
l        M?  US:X  d  MG  UR                  5       U
l        M^     ["        R                  U5      nU(       a
  US   U
l        UR'                  U
5        GM     U(       aR  U HL  n[        [        R                  " SU5      R                  S5      5      n	[        XiS9n
UR'                  U
5        MN     U$ GM<  )Nz	<cs-(\d)>   )r   r	   123r   )readlinerefindall
COMPARISONGRAD_COMPARISONNON_GRAD_COMPARISONrJ   r$   tokenizeintmatchgroupr   ENTITIES_FEATSr
   r   r   KEYWORDr   rI   )r   streamrM   comparison_tagsgrad_comparisonsnon_grad_comparisonscomparison_textcomparison_bundlecompr	   
comparisonentities_featscodeentity_featr   s                  r   r5   7ComparativeSentencesCorpusReader._read_comparison_block   s   ??$D	 jjT:O#%::ot#D ')zz2Et'L$"(//"3"9"9";''&*&:&:&C&CO&TO! %'!# 0$'t(D(J(J1(M$N	%/!0&
  &0)7)?)?)E)5C 1#'3;:E:K:K:MJ$7%)S[:E:K:K:MJ$7%)S[9D9J9J9LJ$6 6D #*//$"7"18J.)00<% !1* ( 4$'t(D(J(J1(M$N	%/!0&
 *00< !5 )(e r   c                 p    / nU R                  U5       H  nUR                  UR                  5        M      U$ N)r5   rI   r   )r   rj   rA   rq   s       r   r=   4ComparativeSentencesCorpusReader._read_keyword_block  s3    55f=JOOJ../ >r   c                 x    UR                  5       n[        R                  " [        U5      (       a6   UR                  5       n[        R                  " [        U5      (       a  OM3  Mg  [        R                  " [
        U5      (       d  [        R	                  U5      (       d  [        R                  " [        U5      (       dr  U R                  (       aE  U R                  R                  U5       Vs/ s H  nU R                  R                  U5      PM      sn$ U R                  R                  U5      /$ GM5  s  snf rw   )r^   r_   rf   STARSr`   ra   rh   CLOSE_COMPARISONr%   rd   r$   )r   rj   rM   sents       r   rQ   1ComparativeSentencesCorpusReader._read_sent_block  s    ??$Dxxt$$!??,Dxxt,,  JJz400&..t44

#3T::'' %)$8$8$A$A$$G$GD ,,55d;$G 
 !0099$?@@' s   0%D7c                 \    / nU R                  U5       H  nUR                  U5        M     U$ rw   )rQ   extend)r   rj   rW   r|   s       r   rV   1ComparativeSentencesCorpusReader._read_word_block1  s-    ))&1DLL 2r   )r&   r%   r$   rw   )r   r   r   r   r   StreamBackedCorpusViewr4   WhitespaceTokenizerr   r:   rA   rN   rR   rW   r5   r=   rQ   rV   r   r   r   r   r   r   `   sP    " (J +,$,
(&
"
 3)jA,r   r   )r   r_   nltk.corpus.reader.apinltk.tokenizecompilerz   ra   r{   rb   rc   rh   ri   r   r#   r   r   r   r   <module>r      s   8 
 $  	

8ZZ'
::o. **]+jj+ <=
**%
&*
 *
ZU| Ur   