
    Of,                     ~    S r SrSSK7  SSKJrJr  SSKJr   " S S\5      rSS	 jr	 " S
 S\
5      r " S S\5      rg)z&
Corpus reader for the SemCor Corpus.
z
epytext en    )*)XMLCorpusReaderXMLCorpusView)Treec                       \ rS rSrSrSS jrSS jrSS jrSS4S jrSS	 jr	SS
 jr
SS4S jrS rS r\S 5       rSrg)SemcorCorpusReader   z
Corpus reader for the SemCor Corpus.
For access to the complete XML data structure, use the ``xml()``
method.  For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
c                 J    [         R                  " XU5        X@l        X0l        g N)r   __init___lazy_wordnet)selfrootfileidswordnetlazys        ;/usr/lib/python3/dist-packages/nltk/corpus/reader/semcor.pyr   SemcorCorpusReader.__init__   s      W5
    Nc                 ,    U R                  USSSS5      $ )zZ
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
wordF_itemsr   r   s     r   wordsSemcorCorpusReader.words    s    
 {{7FE5%@@r   c                 ,    U R                  USSSS5      $ )z
:return: the given file(s) as a list of chunks,
    each of which is a list of words and punctuation symbols
    that form a unit.
:rtype: list(list(str))
chunkFr   r   s     r   chunksSemcorCorpusReader.chunks'   s     {{7GUE5AAr   posc                 8    U R                  USSUS:g  US:g  5      $ )a  
:return: the given file(s) as a list of tagged chunks, represented
    in tree form.
:rtype: list(Tree)

:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
    to indicate the kind of tags to include.  Semantic tags consist of
    WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
    without a specific entry in WordNet.  (Named entities of type 'other'
    have no lemma.  Other chunks not in WordNet have no semantic tag.
    Punctuation tokens have `None` for their part of speech tag.)
r   Fsemr"   r   r   r   tags      r   tagged_chunks SemcorCorpusReader.tagged_chunks0   s#     {{7GUC5L#,OOr   c                 ,    U R                  USSSS5      $ )zx
:return: the given file(s) as a list of sentences, each encoded
    as a list of word strings.
:rtype: list(list(str))
r   TFr   r   s     r   sentsSemcorCorpusReader.sents?   s     {{7FD%??r   c                 ,    U R                  USSSS5      $ )zx
:return: the given file(s) as a list of sentences, each encoded
    as a list of chunks.
:rtype: list(list(list(str)))
r   TFr   r   s     r   chunk_sentsSemcorCorpusReader.chunk_sentsG   s     {{7GT5%@@r   c                 8    U R                  USSUS:g  US:g  5      $ )aC  
:return: the given file(s) as a list of sentences. Each sentence
    is represented as a list of tagged chunks (in tree form).
:rtype: list(list(Tree))

:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
    to indicate the kind of tags to include.  Semantic tags consist of
    WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
    without a specific entry in WordNet.  (Named entities of type 'other'
    have no lemma.  Other chunks not in WordNet have no semantic tag.
    Punctuation tokens have `None` for their part of speech tag.)
r   Tr$   r"   r   r%   s      r   tagged_sentsSemcorCorpusReader.tagged_sentsO   s#     {{7GT3%<NNr   c                    ^  US:X  a  U(       d  U 4S jnO#T R                   (       a  [        OT R                  n[        T R	                  U5       Vs/ s H  nU" XrX4UT R
                  5      PM     sn5      $ s  snf )Nr   c                  b   > [        TR                  (       a  [        OTR                  " U 6 5      $ r   )LazyConcatenationr   SemcorWordView_words)argsr   s    r   <lambda>+SemcorCorpusReader._items.<locals>.<lambda>b   s"    /#'::4;;>Fr   )r   r5   r6   concatabspathsr   )r   r   unitbracket_sentpos_tagsem_tag_fileids   `       r   r   SemcorCorpusReader._items^   sm    6>,A #'**$++A #mmG44F &wN4
 	
s   A7c           	         US;   d   e/ n[         R                  U5      R                  5       nUR                  S5       H  n/ n	[	        U5       HN  n
[
        R                  XXEU R                  5      nUS:X  a  U	R                  U5        M=  U	R                  U5        MP     U(       a*  UR                  [        UR                  S   U	5      5        M  UR                  U	5        M     SU;  d   eU$ )a  
Helper used to implement the view methods -- returns a list of
tokens, (segmented) words, chunks, or sentences. The tokens
and chunks may optionally be tagged (with POS and sense
information).

:param fileid: The name of the underlying file.
:param unit: One of `'token'`, `'word'`, or `'chunk'`.
:param bracket_sent: If true, include sentence bracketing.
:param pos_tag: Whether to include part-of-speech tags.
:param sem_tag: Whether to include semantic tags, namely WordNet lemma
    and OOV named entity status.
)tokenr   r   z.//sr   snumN)ElementTreeparsegetrootfindall_all_xmlwords_inr   _wordr   extendappendSemcorSentenceattrib)r   rA   r<   r=   r>   r?   resultxmldocxmlsentsentxmlworditms               r   r6   SemcorCorpusReader._wordsn   s     1111""6*224~~f-GD+G4(..7T]] 6>KK$KK$ 5 nW^^F-CTJKd# .  6!!!r   c                    U R                   nU(       d  SnU R                  SU5      nU R                  S5      nUb,  US-   U-   nS[        UR                  S5      S   5      S-
     n	OS =pU R                  S	U5      n
U R                  S
5      nSU R	                  5       ;   nU R                  S5      nUS:X  a2  U(       d  U(       d  UnU$ U4U(       a  U4OS-   U(       a  XiX4OS-   nU$ UR                  S5      nUS:X  a  U$ Ub   UR                  U5      nU(       a  [        X5      /OUnU(       a-  U(       a&  Ub  [        W[        SU5      /5      $ [        SU5      $ U(       a  Ub  [        WU5      $ U(       a  US   $ U$ ! [         a4     SUU	[        U5      4-  n N! [         a    US-   U	-   S-   U-   n  Nf = ff = f)N lemmalexsn%)nvars:r      rdfwnsnpnr"   rD    r@   r   z
%s.%s.%02d.NE)	textgetintsplitkeyslemma_from_key	Exception
ValueErrorr   )rT   r<   r>   r?   r   tknrY   rZ   	sense_keywnposredefsensenumisOOVEntityr"   rU   wwsensebottoms                     r   rK   SemcorCorpusReader._word   s   llCGS)G$e+I-EKK$Q'(1,E !%$I3
 ;;v&glln,kk
 7?7 J	 F!(vb2@Gh<RQ 
 J3Bv~	' ' 6 6y A$ -4$s-{+#EDv,>+?@@#D&11!5v..!!9$!M; % 	$0 % % #H4 %E
  *  %e 3c 9H D "s*   F 
GF##F?;G>F??G)r   r   )Tr   )__name__
__module____qualname____firstlineno____doc__r   r   r    r'   r*   r-   r0   r   r6   staticmethodrK   __static_attributes__rf   r   r   r   r      s_     
AB %)u P@A $(e O
 #J J" J"r   r   Nc                 ~    Uc  / nU  H1  nUR                   S;   a  UR                  U5        M&  [        X!5        M3     U$ )Nwfpunc)r&   rM   rJ   )eltrP   childs      r   rJ   rJ      s=    ~99&MM% U+	 
 Mr   c                       \ rS rSrSrS rSrg)rN      z
A list of words, augmented by an attribute ``num`` used to record
the sentence identifier (the ``n`` attribute from the XML).
c                 :    Xl         [        R                  X5        g r   )numlistr   )r   r   itemss      r   r   SemcorSentence.__init__   s    d"r   )r   N)r{   r|   r}   r~   r   r   r   rf   r   r   rN   rN      s    
#r   rN   c                   0    \ rS rSrSrS rS rS rS rSr	g)	r5      zF
A stream backed corpus view specialized for use with the BNC corpus.
c                     U(       a  SnOSnX l         X0l        X@l        XPl        X`l        [
        R                  " XU5        g)aC  
:param fileid: The name of the underlying file.
:param unit: One of `'token'`, `'word'`, or `'chunk'`.
:param bracket_sent: If true, include sentence bracketing.
:param pos_tag: Whether to include part-of-speech tags.
:param sem_tag: Whether to include semantic tags, namely WordNet lemma
    and OOV named entity status.
z.*/sz.*/s/(punc|wf)N)_unit_sent_pos_tag_sem_tagr   r   r   )r   rA   r<   r=   r>   r?   r   tagspecs           r   r   SemcorWordView.__init__   s:     G&G
!
tW5r   c                 h    U R                   (       a  U R                  U5      $ U R                  U5      $ r   )r   handle_senthandle_word)r   r   contexts      r   
handle_eltSemcorWordView.handle_elt  s+    ::##C((##C((r   c                     [         R                  XR                  U R                  U R                  U R
                  5      $ r   )r   rK   r   r   r   r   )r   r   s     r   r   SemcorWordView.handle_word  s.    !''T]]DMM4==
 	
r   c                 $   / nU Hp  nUR                   S;   aG  U R                  U5      nU R                  S:X  a  UR                  U5        MG  UR	                  U5        MZ  [        SUR                   -  5      e   [        UR                  S   U5      $ )Nr   r   zUnexpected element %srE   )r&   r   r   rL   rM   rp   rN   rO   )r   r   rS   r   rU   s        r   r   SemcorWordView.handle_sent  s~    EyyN*&&u-::'KK$KK$ !8599!DEE  cjj0$77r   )r   r   r   r   r   N)
r{   r|   r}   r~   r   r   r   r   r   r   rf   r   r   r5   r5      s    6,)

8r   r5   r   )r   __docformat__nltk.corpus.reader.apinltk.corpus.reader.xmldocsr   r   	nltk.treer   r   rJ   r   rN   r5   rf   r   r   <module>r      sF     $ E K" K"\#T #18] 18r   