
    Of6                         S r SSKrSSKrSSKJr  SSKJrJr  SSKJ	r	  S r
 " S S\	5      r " S	 S
5      r " S S5      r " S S\5      rg)z9
A reader for corpora whose documents are in MTE format.
    N)reduce)TaggedCorpusReaderconcat)XMLCorpusViewc                 $    U R                  X5      $ N)findall)rootpathnss      8/usr/lib/python3/dist-packages/nltk/corpus/reader/mte.pyxpathr      s    <<!!    c                   ,    \ rS rSrSrSS jrSS jrSrg)	MTECorpusView   z(
Class for lazy viewing the MTE Corpus.
Nc                 2    [         R                  " XX#5        g r   )r   __init__)selffileidtagspecelt_handlers       r   r   MTECorpusView.__init__   s    tWBr   c                 X    [        [        S [        R                  " XX#5      5      5      $ )Nc                 
    U S L$ r    xs    r   <lambda>*MTECorpusView.read_block.<locals>.<lambda>       !4-r   )listfilterr   
read_block)r   streamr   r   s       r   r$   MTECorpusView.read_block   s*    '((wL
 	
r   r   r   )NN)__name__
__module____qualname____firstlineno____doc__r   r$   __static_attributes__r   r   r   r   r      s    C
r   r   c                      \ rS rSrSrSSS.rSrSrSrS	r	S
r
S r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rS rS rS rS rS rS rS rS rS rSrg) MTEFileReader"   z
Class for loading the content of the multext-east corpus. It
parses the xml files and does some tag-filtering depending on the
given method parameters.
zhttps://www.tei-c.org/ns/1.0z%https://www.w3.org/XML/1998/namespace)teixmlz{https://www.tei-c.org/ns/1.0}z'{https://www.w3.org/XML/1998/namespace}zTEI/text/body/div/div/p/s/(w|c)zTEI/text/body/div/div/p/szTEI/text/body/div/div/pc                     Xl         g r   )_MTEFileReader__file_path)r   	file_paths     r   r   MTEFileReader.__init__3   s    $r   c                     UR                   $ r   )textclseltcontexts      r   	_word_eltMTEFileReader._word_elt6   s    xxr   c                 z    [        USU R                  5       Vs/ s H  o0R                  US 5      PM     sn$ s  snf N*)r   r   r<   r9   r:   r;   ws       r   	_sent_eltMTEFileReader._sent_elt:   2    05c30GH0G1a&0GHHH   8c                 z    [        USU R                  5       Vs/ s H  o0R                  US 5      PM     sn$ s  snf r?   )r   r   rC   r9   r:   r;   ss       r   	_para_eltMTEFileReader._para_elt>   rE   rF   c                    SUR                   ;  a  UR                  S4$ U R                  S:X  a+  U R                  S:X  a  UR                  UR                   S   4$ U R                  S:X  a>  U R                  S:X  a.  UR                  [        R                  UR                   S   5      4$ [        R                  " S[        R                  " SSU R                  5      -   S-   5      nUR                  UR                   S   5      (       aY  U R                  S:X  a  UR                  UR                   S   4$ UR                  [        R                  UR                   S   5      4$ g )	Nana msd	universal^-.z.*$)
attribr7   _MTEFileReader__tags_MTEFileReader__tagsetMTETagConvertermsd_to_universalrecompilesubmatch)r9   r:   r;   tagss       r   _tagged_word_eltMTEFileReader._tagged_word_eltB   s   

"HHb>!:: 5HHcjj/00ZZ2#,,+"=HHo>>szz%?PQRR::cBFF3SZZ$@@5HIDzz#**U+,,<<5(HHcjj&788 '88E9JK 
 r   c                     [        [        S [        USU R                  5       Vs/ s H  o0R	                  US 5      PM     sn5      5      $ s  snf )Nc                 
    U S L$ r   r   r   s    r   r   0MTEFileReader._tagged_sent_elt.<locals>.<lambda>\   r!   r   r@   )r"   r#   r   r   r^   rA   s       r   _tagged_sent_eltMTEFileReader._tagged_sent_eltX   L    '8=c38OP8O1%%a.8OP
 	
 Q   Ac                     [        [        S [        USU R                  5       Vs/ s H  o0R	                  US 5      PM     sn5      5      $ s  snf )Nc                 
    U S L$ r   r   r   s    r   r   0MTEFileReader._tagged_para_elt.<locals>.<lambda>e   r!   r   r@   )r"   r#   r   r   rc   rH   s       r   _tagged_para_eltMTEFileReader._tagged_para_elta   re   rf   c                 t    SUR                   ;  a  UR                  S4$ UR                  UR                   S   4$ )NlemmarN   )rT   r7   r8   s      r   _lemma_word_eltMTEFileReader._lemma_word_eltj   s4    #**$HHb>!HHcjj122r   c                 z    [        USU R                  5       Vs/ s H  o0R                  US 5      PM     sn$ s  snf r?   )r   r   rn   rA   s       r   _lemma_sent_eltMTEFileReader._lemma_sent_eltq   4    6;Ccff6MN6M##At,6MNNNrF   c                 z    [        USU R                  5       Vs/ s H  o0R                  US 5      PM     sn$ s  snf r?   )r   r   rq   rH   s       r   _lemma_para_eltMTEFileReader._lemma_para_eltu   rs   rF   c                 h    [        U R                  [        R                  [        R                  5      $ r   )r   r3   r.   	word_pathr<   r   s    r   wordsMTEFileReader.wordsy   (    m55}7N7N
 	
r   c                 h    [        U R                  [        R                  [        R                  5      $ r   )r   r3   r.   	sent_pathrC   ry   s    r   sentsMTEFileReader.sents~   r|   r   c                 h    [        U R                  [        R                  [        R                  5      $ r   )r   r3   r.   	para_pathrJ   ry   s    r   parasMTEFileReader.paras   r|   r   c                 h    [        U R                  [        R                  [        R                  5      $ r   )r   r3   r.   rx   rn   ry   s    r   lemma_wordsMTEFileReader.lemma_words   (    m55}7T7T
 	
r   c                     U[         l        U[         l        [        U R                  [         R
                  [         R                  5      $ r   )r.   rV   rU   r   r3   rx   r^   r   tagsetr]   s      r   tagged_wordsMTEFileReader.tagged_words   8    !'#m55}7U7U
 	
r   c                 h    [        U R                  [        R                  [        R                  5      $ r   )r   r3   r.   r~   rq   ry   s    r   lemma_sentsMTEFileReader.lemma_sents   r   r   c                     U[         l        U[         l        [        U R                  [         R
                  [         R                  5      $ r   )r.   rV   rU   r   r3   r~   rc   r   s      r   tagged_sentsMTEFileReader.tagged_sents   r   r   c                 h    [        U R                  [        R                  [        R                  5      $ r   )r   r3   r.   r   ru   ry   s    r   lemma_parasMTEFileReader.lemma_paras   r   r   c                     U[         l        U[         l        [        U R                  [         R
                  [         R                  5      $ r   )r.   rV   rU   r   r3   r   rj   r   s      r   tagged_parasMTEFileReader.tagged_paras   r   r   )__file_pathN) r'   r(   r)   r*   r+   r   tag_nsxml_nsrx   r~   r   r   classmethodr<   rC   rJ   r^   rc   rj   rn   rq   ru   rz   r   r   r   r   r   r   r   r   r,   r   r   r   r.   r.   "   s    .6
B .F6F1I+I)I%   I I I I  * 
 
 
 
 3 3 O O O O














r   r.   c                   F    \ rS rSrSrSSSSSSS	S
SSSSS.r\S 5       rSrg)rW      zi
Class for converting msd tags to universal tags, more conversion
options are currently not implemented.
ADJADPADVCONJDETNOUNNUMPRTPRONVERBrS   X)ASRCDNMQPVrS   rR   c                 z    U S   S:X  d  U S   OU S   nU[         R                  ;  a  Sn[         R                  U   $ )z
This function converts the annotation from the Multex-East to the universal tagset
as described in Chapter 5 of the NLTK-Book

Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so
r   #   rR   )rW   mapping_msd_universal)tag	indicators     r   rX    MTETagConverter.msd_to_universal   sA     #&a&C-CFSV	OAAAI44Y??r   r   N)	r'   r(   r)   r*   r+   r   staticmethodrX   r,   r   r   r   rW   rW      sK      @ @r   rW   c                       \ rS rSrSrSS jrS rSS jrSS jrSS jr	SS	 jr
SS
 jrSS jrSS jrSS jrSS jrSrg)MTECorpusReader   z
Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
scheme. These tags can be converted to the Universal tagset
Nc                 @    [         R                  " XX#5        SU l        g)a  
Construct a new MTECorpusreader for a set of documents
located at the given root directory.  Example usage:

    >>> root = '/...path to corpus.../'
    >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

:param root: The root directory for this corpus. (default points to location in multext config file)
:param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
:param encoding: The encoding of the given files (default is utf8)
z00README.txtN)r   r   _readme)r   r
   fileidsencodings       r   r   MTECorpusReader.__init__   s     	##DB%r   c                    ^  Uc  T R                   nO[        U[        5      (       a  U/n[        U 4S jU5      n[        S U5      nU(       d  [	        S5        U$ )Nc                 "   > U TR                   ;   $ r   )_fileids)r   r   s    r   r   +MTECorpusReader.__fileids.<locals>.<lambda>   s    1#5r   c                     U S;  $ )N)zoana-bg.xmlzoana-mk.xmlr   r   s    r   r   r      s
    1,J#Jr   z$No valid multext-east file specified)r   
isinstancestrr#   print)r   r   s   ` r   	__fileidsMTECorpusReader.__fileids   sP    ?mmG%%iG5w?JGT89r   c                     [        U R                  U5       Vs/ s HD  n[        [        R                  R                  U R                  U5      5      R                  5       PMF     sn5      $ s  snf )z
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
)r   _MTECorpusReader__fileidsr.   osr   join_rootrz   r   r   fs      r   rz   MTECorpusReader.words   s]      00A bggll4::q9:@@B0
 	
   AA+c                     [        U R                  U5       Vs/ s HD  n[        [        R                  R                  U R                  U5      5      R                  5       PMF     sn5      $ s  snf )z
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a list of sentences or utterances,
         each encoded as a list of word strings
:rtype: list(list(str))
)r   r   r.   r   r   r   r   r   r   s      r   r   MTECorpusReader.sents  ]      00A bggll4::q9:@@B0
 	
r   c                     [        U R                  U5       Vs/ s HD  n[        [        R                  R                  U R                  U5      5      R                  5       PMF     sn5      $ s  snf )z
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a list of paragraphs, each encoded as a list
         of sentences, which are in turn encoded as lists of word string
:rtype: list(list(list(str)))
)r   r   r.   r   r   r   r   r   r   s      r   r   MTECorpusReader.paras  r   r   c                     [        U R                  U5       Vs/ s HD  n[        [        R                  R                  U R                  U5      5      R                  5       PMF     sn5      $ s  snf )z
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a list of words, the corresponding lemmas
         and punctuation symbols, encoded as tuples (word, lemma)
:rtype: list(tuple(str,str))
)r   r   r.   r   r   r   r   r   r   s      r   r   MTECorpusReader.lemma_words  s]      00A bggll4::q9:FFH0
 	
r   c                    US:X  d  US:X  ak  [        U R                  U5       Vs/ s HE  n[        [        R                  R                  U R                  U5      5      R                  X#5      PMG     sn5      $ [        S5        gs  snf )a  
:param fileids: A list specifying the fileids that should be used.
:param tagset: The tagset that should be used in the returned object,
               either "universal" or "msd", "msd" is the default
:param tags: An MSD Tag that is used to filter all parts of the used corpus
             that are not more precise or at least equal to the given tag
:return: the given file(s) as a list of tagged words and punctuation symbols
         encoded as tuples (word, tag)
:rtype: list(tuple(str, str))
rP   rO   Unknown tagset specified.N)	r   r   r.   r   r   r   r   r   r   r   r   r   r]   r   s        r   r   MTECorpusReader.tagged_words,       [ FeO
 "^^G4	 5 ""'',,tzz1"=>KK 5	  -.   ABc                     [        U R                  U5       Vs/ s HD  n[        [        R                  R                  U R                  U5      5      R                  5       PMF     sn5      $ s  snf )a  
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a list of sentences or utterances, each
         encoded as a list of tuples of the word and the corresponding
         lemma (word, lemma)
:rtype: list(list(tuple(str, str)))
)r   r   r.   r   r   r   r   r   r   s      r   r   MTECorpusReader.lemma_sentsC  ]      00A bggll4::q9:FFH0
 	
r   c                    US:X  d  US:X  ak  [        U R                  U5       Vs/ s HE  n[        [        R                  R                  U R                  U5      5      R                  X#5      PMG     sn5      $ [        S5        gs  snf )a   
:param fileids: A list specifying the fileids that should be used.
:param tagset: The tagset that should be used in the returned object,
               either "universal" or "msd", "msd" is the default
:param tags: An MSD Tag that is used to filter all parts of the used corpus
             that are not more precise or at least equal to the given tag
:return: the given file(s) as a list of sentences or utterances, each
         each encoded as a list of (word,tag) tuples
:rtype: list(list(tuple(str, str)))
rP   rO   r   N)	r   r   r.   r   r   r   r   r   r   r   s        r   r   MTECorpusReader.tagged_sentsR  r   r   c                     [        U R                  U5       Vs/ s HD  n[        [        R                  R                  U R                  U5      5      R                  5       PMF     sn5      $ s  snf )a=  
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a list of paragraphs, each encoded as a
         list of sentences, which are in turn encoded as a list of
         tuples of the word and the corresponding lemma (word, lemma)
:rtype: list(List(List(tuple(str, str))))
)r   r   r.   r   r   r   r   r   r   s      r   r   MTECorpusReader.lemma_parasi  r   r   c                    US:X  d  US:X  ak  [        U R                  U5       Vs/ s HE  n[        [        R                  R                  U R                  U5      5      R                  X#5      PMG     sn5      $ [        S5        gs  snf )a/  
:param fileids: A list specifying the fileids that should be used.
:param tagset: The tagset that should be used in the returned object,
               either "universal" or "msd", "msd" is the default
:param tags: An MSD Tag that is used to filter all parts of the used corpus
             that are not more precise or at least equal to the given tag
:return: the given file(s) as a list of paragraphs, each encoded as a
         list of sentences, which are in turn encoded as a list
         of (word,tag) tuples
:rtype: list(list(list(tuple(str, str))))
rP   rO   r   N)	r   r   r.   r   r   r   r   r   r   r   s        r   r   MTECorpusReader.tagged_parasx  s     [ FeO
 "^^G4	 5 ""'',,tzz1"=>KK 5	  -.r   )r   )NNutf8r   )NrO   rN   )r'   r(   r)   r*   r+   r   r   rz   r   r   r   r   r   r   r   r   r,   r   r   r   r   r      s>    &



/.
/.
/r   r   )r+   r   rY   	functoolsr   nltk.corpus.readerr   r   nltk.corpus.reader.xmldocsr   r   r   r.   rW   r   r   r   r   <module>r      sU    
 	  9 4"
M 
"H
 H
V"@ "@J|/( |/r   