
    Of                         S r SSKrSSKJr  SSK7  SSK7  SSK7   " S S5      r " S S\5      r	 " S	 S
\
5      rS rg)a  
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
    N)ElementTree)*c                        \ rS rSrS rS rSrg)SensevalInstance!   c                 H    Xl         [        U5      U l        X l        X0l        g N)wordtuplesensespositioncontext)selfr
   r   r   r   s        =/usr/lib/python3/dist-packages/nltk/corpus/reader/senseval.py__init__SensevalInstance.__init__"   s    	Fm     c           	      x    SU R                   < SU R                  < SU R                  < SU R                  < S3	$ )NzSensevalInstance(word=z, position=z
, context=z	, senses=))r
   r   r   r   )r   s    r   __repr__SensevalInstance.__repr__(   s(    IIMMLLKK	
 	
r   )r   r   r   r
   N)__name__
__module____qualname____firstlineno__r   r   __static_attributes__ r   r   r   r   !   s    
r   r   c                   $    \ rS rSrSS jrS rSrg)SensevalCorpusReader1   Nc           
          [        U R                  US5       VVs/ s H  u  p#[        X#5      PM     snn5      $ s  snnf )NT)concatabspathsSensevalCorpusView)r   fileidsfileidencs       r   	instancesSensevalCorpusReader.instances2   sC     &*]]7D%A%AMV #6/%A
 	
s   :
c                    / nUR                  S5       Hl  nUR                  S5       HT  nUS   R                  S   nUS    Vs/ s H  ofR                  UR                  S   4PM     nnUR                  XW45        MV     Mn     U$ s  snf )Nlexeltinstancer   senseid   pos)findallattribtextappend)r   treeeltsr+   instsensewr   s           r   _entrySensevalCorpusReader._entry:   s    ll8,Fz2Qy1>B1gFgFFAHHUO4gFU,- 3 -
  Gs   $Br   r	   )r   r   r   r   r(   r9   r   r   r   r   r   r   1   s    
r   r   c                   &    \ rS rSrS rS rS rSrg)r$   D   c                 j    [         R                  XUS9  [        5       U l        S/U l        S /U l        g )N)encodingr   )StreamBackedCorpusViewr   WhitespaceTokenizer_word_tokenizer_lexelt_starts_lexelts)r   r&   r>   s      r   r   SensevalCorpusView.__init__E   s3    ''x'H24 cr   c                    [         R                  U R                  UR                  5       5      S-
  nU R                  U   n/ nSn UR                  5       nUS:X  a
  U/ :X  d   e/ $ UR                  5       R                  S5      (       a  US-  n[        R                  " SU5      nUc   eUR                  S5      SS nU[        U R                  5      :  a  X0R                  U   :X  d   eODU R                  R                  U5        U R                  R                  UR                  5       5        UR                  5       R                  S5      (       a
  U/ :X  d   eSnU(       a  UR                  U5        UR                  5       R                  S	5      (       aD  S
R                  U5      n[        U5      n[        R                   " U5      n	U R#                  X5      /$ GM  )Nr.   FT z<lexeltzitem=("[^"]+"|'[^']+')z	<instancez
</instance
)bisectbisect_rightrB   tellrC   readlinelstrip
startswithresearchgrouplenr3   join_fixXMLr   
fromstring_parse_instance)
r   stream
lexelt_numr+   instance_linesin_instancelinem	xml_blockr6   s
             r   
read_blockSensevalCorpusView.read_blockL   s   (()<)<fkkmLqP
z*??$Drz%+++	 {{}''	22a
II94@}$}Ab)DMM 22!]]:%>>>>MM((0''..v{{}= {{}''44%+++" %%d+ {{}''55 IIn5	#I.	"--i8,,T:;;A r   c                 l   / n/ nS nU GH  nUR                   S:X  a   UR                  UR                  S   5        M4  UR                   S:X  GaF  X@R                  R	                  UR
                  5      -  nU GH  nUR                   S:X  a  US   nUR                   S:X  GaG  Ub   S5       eUR
                  R                  5       (       d  [        U5      S:X  d   eUR
                  R                  5       (       a  [        U5      S:X  a   e[        U5      nUR
                  R                  5       (       a*  UR                  UR
                  R                  5       5        OUS   R                   S	:X  ao  UR                  US   R
                  US   R                  S
   45        US   R                  (       a*  X@R                  R	                  US   R                  5      -  nOp S5       eUR                   S	:X  a+  UR                  UR
                  UR                  S
   45        O.UR                   S:X  a  O[        SUR                   5         S5       eUR                  (       d  GM  X@R                  R	                  UR                  5      -  nGM     GM   SUR                   -  5       e   [        X%XC5      $ )Nanswerr-   r   compoundr   headzhead specified twicer.   wfr/   zexpected CDATA or wf in <head>sACKz expected CDATA or <wf> or <head>zunexpected tag %s)tagr3   r1   rA   tokenizer2   striprR   tailprintr   )r   r,   r+   r   r   r   childcwords           r   rV   "SensevalCorpusView._parse_instanceu   s,   EyyH$ell956i'//88DD"EyyJ. %ayyF*'/G1GG/$zz//11SZ1_DD$)JJ$4$4$6$63u:?KK#&w< ::++--#NN5::+;+;+=>"1X\\T1#NNE!HMM58??5;Q+RS$Qx}} '+?+?+H+Hq+W WJ*JJ5d*

ELL4G'HIc) eUYY/H&HHuzzz#7#7#@#@#LL; #> >1EII==uI J  'BBr   )rB   rC   rA   N)r   r   r   r   r   r^   rV   r   r   r   r   r$   r$   D   s    '<R)Cr   r$   c                    [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " S	S
U 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      n U $ )z2
Fix the various issues with Senseval pseudo-XML.
z	<([~\^])>z\1z(\s+)\&(\s+)z	\1&amp;\2z"""z'"'z(<[^<]*snum=)([^">]+)>z\1"\2"/>z<\&frasl>\s*<p[^>]*>FRASLz
<\&I[^>]*>rF   z<{([^}]+)}>z	<(@|/?p)>z	<&\w+ \.>z<!DOCTYPE[^>]*>z<\[\/?[^>]+\]*>z
<(\&\w+;)>z&(?!amp|gt|lt|apos|quot)z'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>z <wf pos="\2">\1</wf>z\s*"\s*<p=\'"\'/>z <wf pos='"'>"</wf>)rO   sub)r2   s    r   rT   rT      s$   
 66,t,D66/<6D66&&$'D66+[$?D66)7D9D66-T*D66.%.D66,T*D66,T*D66$c40D66$c40D66--D66-sD9D6624LdD 66&(?FDKr   )__doc__rO   	xml.etreer   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tokenizer   CorpusReaderr   r?   r$   rT   r   r   r   <module>rx      sH     
 ! $ % 
 
 < &ZC/ ZCz#r   