
    Of>                         S r SSKrSSKJr  SSKJr  SSK7  SSKJr  SSK	J
r
  SSKJr   " S	 S
\5      r " S S\5      rg)z
Corpus reader for corpora whose documents are xml files.

(note -- not named 'xml' to avoid conflicting w/ standard xml package)
    N)ElementTree)CorpusReader)*)SeekableUnicodeStreamReader)ElementWrapper)WordPunctTokenizerc                   6    \ rS rSrSrSS jrS	S jrS	S jrSrg)
XMLCorpusReader   z
Corpus reader for corpora whose documents are xml files.

Note that the ``XMLCorpusReader`` constructor does not take an
``encoding`` argument, because the unicode encoding is specified by
the XML files themselves.  See the XML specs for more info.
c                 >    X0l         [        R                  " XU5        g N)_wrap_etreer   __init__)selfrootfileids
wrap_etrees       </usr/lib/python3/dist-packages/nltk/corpus/reader/xmldocs.pyr   XMLCorpusReader.__init__!   s    %d'2    Nc                    Uc(  [        U R                  5      S:X  a  U R                  S   n[        U[        5      (       d  [	        S5      eU R                  U5      R                  5        n[        R                  " U5      R                  5       nS S S 5        U R                  (       a  [        W5      nW$ ! , (       d  f       N,= f)N   r   z(Expected a single file identifier string)len_fileids
isinstancestr	TypeErrorabspathopenr   parsegetrootr   r   )r   fileidfpelts       r   xmlXMLCorpusReader.xml%   s    >c$--0A5]]1%F&#&&FGG\\&!&&(B##B'//1C )  %C
 )(s   +%B66
Cc                 v   U R                  U5      nU R                  U5      n[        5       n UR                  5       n/ nU H\  nUR
                  nUc  M  [        U[        5      (       a  UR                  U5      nUR                  U5      n	UR                  U	5        M^     U$ !   UR	                  5       n Nz= f)a  
Returns all of the words and punctuation symbols in the specified file
that were in text nodes -- ie, tags are ignored. Like the xml() method,
fileid can only specify one file.

:return: the given file's text nodes as a list of words and punctuation symbols
:rtype: list(str)
)r%   encodingr   getiteratoritertextr   bytesdecodetokenizeextend)
r   r"   r$   r(   word_tokenizeriteratoroutnoder+   tokss
             r   wordsXMLCorpusReader.words4   s     hhv==(+-	"(H D99DdE**;;x0D%..t4

4   
	"xxzHs   B$ $B8)r   )Fr   )	__name__
__module____qualname____firstlineno____doc__r   r%   r5   __static_attributes__ r   r   r
   r
      s    3r   r
   c                      \ rS rSrSrSrSrSS jrS rS r	\
R                  S	\
R                  \
R                  -  5      r\
R                  S
5      r\
R                  S\
R                  \
R                  -  5      rS rSS jrSrg)XMLCorpusViewQ   a  
A corpus view that selects out specified elements from an XML
file, and provides a flat list-like interface for accessing them.
(Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
but may be used by subclasses of ``XMLCorpusReader``.)

Every XML corpus view has a "tag specification", indicating what
XML elements should be included in the view; and each (non-nested)
element that matches this specification corresponds to one item in
the view.  Tag specifications are regular expressions over tag
paths, where a tag path is a list of element tag names, separated
by '/', indicating the ancestry of the element.  Some examples:

  - ``'foo'``: A top-level element whose tag is ``foo``.
  - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
    is a top-level element whose tag is ``foo``.
  - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
    in the xml tree.
  - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
    appearing anywhere in the xml tree.

The view items are generated from the selected XML elements via
the method ``handle_elt()``.  By default, this method returns the
element as-is (i.e., as an ElementTree object); but it can be
overridden, either via subclassing or via the ``elt_handler``
constructor parameter.
Fi   Nc                     U(       a  X0l         [        R                  US-   5      U l         SS0U l         U R                  U5      n[        R                  XUS9  g)a  
Create a new corpus view based on a specified XML file.

Note that the ``XMLCorpusView`` constructor does not take an
``encoding`` argument, because the unicode encoding is
specified by the XML files themselves.

:type tagspec: str
:param tagspec: A tag specification, indicating what XML
    elements should be included in the view.  Each non-nested
    element that matches this specification corresponds to one
    item in the view.

:param elt_handler: A function used to transform each element
    to a value for the view.  If no handler is specified, then
    ``self.handle_elt()`` is called, which returns the element
    as an ElementTree object.  The signature of elt_handler is::

        elt_handler(elt, tagspec) -> value
z\Zr   r=   )r(   N)
handle_eltrecompile_tagspec_tag_context_detect_encodingStreamBackedCorpusViewr   )r   r"   tagspecelt_handlerr(   s        r   r   XMLCorpusView.__init__u   sY    * )O

7U?39G	
 ((0''x'Hr   c                    [        U[        5      (       a2   UR                  5       nUR                  5       nUR	                  5         O%[        US5       nUR                  5       nS S S 5        WR                  [        R                  5      (       a  gUR                  [        R                  5      (       a  gUR                  [        R                  5      (       a  gUR                  [        R                  5      (       a  gUR                  [        R                  5      (       a  g[        R                  SU5      nU(       a  UR                  S5      R                  5       $ [        R                  S	U5      nU(       a  UR                  S5      R                  5       $ g! WR	                  5         f = f! , (       d  f       GNV= f)
Nrbz	utf-16-bez	utf-16-lez	utf-32-bez	utf-32-lezutf-8s!   \s*<\?xml\b.*\bencoding="([^"]+)"r   s!   \s*<\?xml\b.*\bencoding='([^']+)')r   PathPointerr   readlineclose
startswithcodecsBOM_UTF16_BEBOM_UTF16_LEBOM_UTF32_BEBOM_UTF32_LEBOM_UTF8rC   matchgroupr-   )r   r"   infilesms        r   rG   XMLCorpusView._detect_encoding   s4   fk**OO%fd#vOO% $<<++,,<<++,,<<++,,<<++,,<<((HH:A>771:$$&&HH:A>771:$$&&+ ##s    F F4F14
Gc                     U$ )a  
Convert an element into an appropriate value for inclusion in
the view.  Unless overridden by a subclass or by the
``elt_handler`` constructor argument, this method simply
returns ``elt``.

:return: The view value corresponding to ``elt``.

:type elt: ElementTree
:param elt: The element that should be converted.

:type context: str
:param context: A string composed of element tags separated by
    forward slashes, indicating the XML context of the given
    element.  For example, the string ``'foo/bar/baz'``
    indicates that the element is a ``baz`` element whose
    parent is a ``bar`` element and whose grandparent is a
    top-level ``foo`` element.
r=   )r   r$   contexts      r   rB   XMLCorpusView.handle_elt   s	    ( 
r   a;  
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Zz<\s*(?:/\s*)?([^\s>]+)a6  
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )c                    Sn[        U[        5      (       a  UR                  5       n UR                  U R                  5      nX$-  nU R
                  R                  U5      (       a  U$ [        R                  SU5      R                  S5      S:X  aO  UR                  5       [        U5      [        R                  SU5      R                  5       -
  -
  n[        SU-  5      eU(       d  [        S5      eUR                  S5      nUS:  a  U R
                  R                  USU 5      (       a\  [        U[        5      (       a#  UR                  W5        UR                  U5        OUR                  [        U5      U-
  * S	5        USU $ GMd  )
aC  
Read a string from the given stream that does not contain any
un-closed tags.  In particular, this function first reads a
block from the stream of size ``self._BLOCK_SIZE``.  It then
checks if that block contains an un-closed tag.  If it does,
then this function either backtracks to the last '<', or reads
another block.
 z[<>]r   >zUnexpected ">" near char %sz&Unexpected end of file: tag not closed<Nr   )r   r   tellread_BLOCK_SIZE_VALID_XML_RErX   rC   searchrY   r   end
ValueErrorrfindseekchar_seek_forward)r   streamfragmentstartpos	xml_blockposlast_open_brackets          r   _read_xml_fragment XMLCorpusView._read_xml_fragment   sd    f9::{{}HD$4$45I!H !!''11 yy*003s:kkmMBIIfh$?$C$C$EE !!>!DEE  !IJJ
 !)s 3 1$%%++H5G6G,HII!&*EFFH-001BCc(m6G&G$H!L#$6%677? r   c                 d	   Uc  U R                   nUc  U R                  n[        U R                  R	                  UR                  5       5      5      nUc   e/ nSnSnSnU/ :X  d  UGb  [        U[        5      (       a  UR                  5       n	U R                  U5      n
U
(       d  Uc  GOj[        S5      eU R                  R                  U
5       GH  nU R                  (       a;  [        SR                  SR                  U5      SS UR!                  5       5      5        UR!                  S5      (       a  U R"                  R%                  UR!                  5       5      R!                  S5      nUR'                  U5        UcI  [(        R%                  USR                  U5      5      (       a  UR+                  5       n[-        U5      nM  M  M  UR!                  S	5      (       a  U R"                  R%                  UR!                  5       5      R!                  S5      nU(       d  [        S
U-  5      eXS   :w  a  [        SUS    SU S35      eUbJ  U[-        U5      :X  a;  XXkR/                  5        -  nUR'                  USR                  U5      45        S=pgSnUR1                  5         GM  UR!                  S5      (       d  GM  U R"                  R%                  UR!                  5       5      R!                  S5      nUb  GM4  [(        R%                  USR                  U5      S-   U-   5      (       d  GMg  UR'                  UR!                  5       SR                  U5      S-   U-   45        GM     Ub  U/ :X  a
  XUS -  nSnOU R                  (       a  [        S5        [        U[        5      (       a#  UR3                  W	5        UR5                  U5        OUR3                  [-        U
5      U-
  * S5        USUS-
   nS=pgSnU/ :X  a  GM  Ub  GM  UR                  5       nXR                  ;   a  [7        U5      U R                  U   :X  d   eO[7        U5      U R                  U'   U VVs/ s H2  u  pU" [8        R:                  " UR=                  SS5      5      U5      PM4     snn$ s  snnf )z
Read from ``stream`` until we find at least one element that
matches ``tagspec``, and return the result of applying
``elt_handler`` to each element found.
Nrb   zUnexpected end of filez	{:>25} {}/i	START_TAGr   END_TAGzUnmatched tag </%s>zUnmatched tag <z>...</rc   EMPTY_ELT_TAGr   z/                                    (backtrack)asciixmlcharrefreplace)rE   rB   listrF   getre   r   r   ru   rk   
_XML_PIECEfinditer_DEBUGprintformatjoinrY   _XML_TAG_NAMErX   appendrC   startr   rj   poprm   rn   tupler   
fromstringencode)r   ro   rI   rJ   r_   elts	elt_start	elt_depthelt_textrq   xml_fragmentpiecenamers   r$   s                  r   
read_blockXMLCorpusView.read_block"  s    ?mmG//K t((,,V[[];<"""		bjI1&"=>>!;;=226:L  $$%=>> 11,?;;+,,SXXg->st-DekkmTU;;{++--33EKKMBHHKDNN4( (88GSXXg->??(-I(+GI @ )
 [[++--33EKKMBHHKD"()>)EFFr{*(?72;-vdVST)UVV ,c'l1J YY[$IIXsxx/@$AB044	#%KKM[[11--33EKKMBHHKD (88GSXXg->-Dt-KLL KK8IC8ORV8V(WXE @H $ 2:YZ 88H !I {{67!&*EFFH-00;c,&7)&C$DaH%o	A6G,00I!HO bjI1T kkm###>T%6%6s%;;;;%*7^Dc" #'

 #'	 &&szz';N'OP #'
 	
 
s   /9R,)rF   rE   rB   r   )NN)r7   r8   r9   r:   r;   r   rg   r   rG   rB   rC   rD   DOTALLVERBOSErh   r   r   ru   r   r<   r=   r   r   r?   r?   Q   s    < F K"IH:0 JJ	 			BJJM JJ89M 		E 			BJJJ,8bk
r   r?   )r;   rR   	xml.etreer   nltk.corpus.reader.apir   nltk.corpus.reader.util	nltk.datar   nltk.internalsr   nltk.tokenizer   r
   rH   r?   r=   r   r   <module>r      s=     ! / % 1 ) ,6l 6r|
* |
r   