
    Of$                     t    S r SSKJr  SSKJrJrJr   " S S\5      rSS jr " S S	\	5      r
 " S
 S\5      rg)zACorpus reader for the XML version of the British National Corpus.    )concat)ElementTreeXMLCorpusReaderXMLCorpusViewc                   Z    \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS	 jr
S
 rSrg)BNCCorpusReader   a  Corpus reader for the XML version of the British National Corpus.

For access to the complete XML data structure, use the ``xml()``
method.  For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

You can obtain the full version of the BNC corpus at
https://www.ota.ox.ac.uk/desc/2554

If you extracted the archive to a directory called `BNC`, then you can
instantiate the reader as::

    BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

c                 >    [         R                  " XU5        X0l        g N)r   __init___lazy)selfrootfileidslazys       8/usr/lib/python3/dist-packages/nltk/corpus/reader/bnc.pyr   BNCCorpusReader.__init__   s      W5
    Nc                 *    U R                  USSX#5      $ )a  
:return: the given file(s) as a list of words
    and punctuation symbols.
:rtype: list(str)

:param strip_space: If true, then strip trailing spaces from
    word tokens.  Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
FN_viewsr   r   strip_spacestems       r   wordsBNCCorpusReader.words#   s     {{7E4CCr   c                 @    U(       a  SOSnU R                  USXSU5      $ )a  
:return: the given file(s) as a list of tagged
    words and punctuation symbols, encoded as tuples
    ``(word,tag)``.
:rtype: list(tuple(str,str))

:param c5: If true, then the tags used will be the more detailed
    c5 tags.  Otherwise, the simplified tags will be used.
:param strip_space: If true, then strip trailing spaces from
    word tokens.  Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
c5posFr   r   r   r   r   r   tags         r   tagged_wordsBNCCorpusReader.tagged_words/   s"     de{{7E3TBBr   c                 *    U R                  USSX#5      $ )aH  
:return: the given file(s) as a list of
    sentences or utterances, each encoded as a list of word
    strings.
:rtype: list(list(str))

:param strip_space: If true, then strip trailing spaces from
    word tokens.  Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
TNr   r   s       r   sentsBNCCorpusReader.sents?   s     {{7D$BBr   c                 <    U(       a  SOSnU R                  USXSUS9$ )a  
:return: the given file(s) as a list of
    sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))

:param c5: If true, then the tags used will be the more detailed
    c5 tags.  Otherwise, the simplified tags will be used.
:param strip_space: If true, then strip trailing spaces from
    word tokens.  Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
r   r   T)sentr!   r   r   r   r    s         r   tagged_sentsBNCCorpusReader.tagged_sentsL   s,     de{{$Ct  
 	
r   c                     U R                   (       a  [        OU R                  n[        U R	                  U5       Vs/ s H  nU" XrX4U5      PM     sn5      $ s  snf )zPA helper function that instantiates BNCWordViews or the list of words/sentences.)r   BNCWordView_wordsr   abspaths)r   r   r(   r!   r   r   ffileids           r   r   BNCCorpusReader._views]   sT    ::K4;; #mmG44F &$74
 	
s   Ac           	      z   / n[         R                  " U5      R                  5       nUR                  S5       H  n/ n	[	        U5       H  n
U
R
                  nU(       d  SnU(       d  U(       a  UR                  5       nU(       a  U
R                  SU5      nUS:X  a  XR                  S5      4nO(US:X  a"  XR                  SU
R                  S5      5      4nU	R                  U5        M     U(       a*  UR                  [        UR                  S   U	5      5        M  UR                  U	5        M     SU;  d   eU$ )a  
Helper used to implement the view methods -- returns a list of
words or a list of sentences, optionally tagged.

:param fileid: The name of the underlying file.
:param bracket_sent: If true, include sentence bracketing.
:param tag: The name of the tagset to use, or None for no tags.
:param strip_space: If true, strip spaces from word tokens.
:param stem: If true, then substitute stems for words.
z.//s hwr   r   nN)r   parsegetrootfindall_all_xmlwords_intextstripgetappendBNCSentenceattribextend)r   r0   bracket_sentr!   r   r   resultxmldocxmlsentr(   xmlwordwords               r   r-   BNCCorpusReader._wordsg   s    ""6*224~~f-GD+G4||D$::<D";;tT2D$; ++d"34DE\ ++eW[[5F"GHDD! 5 k'..*=tDEd#% .( 6!!!r   )r   )T)NTF)NFTF)NFFTF)__name__
__module____qualname____firstlineno____doc__r   r   r"   r%   r)   r   r-   __static_attributes__ r   r   r   r      s-     
DC C
"
#r   r   Nc                 ~    Uc  / nU  H1  nUR                   S;   a  UR                  U5        M&  [        X!5        M3     U$ )N)cw)r!   r=   r9   )eltrB   childs      r   r9   r9      s=    ~99
"MM% U+	 
 Mr   c                       \ rS rSrSrS rSrg)r>      z
A list of words, augmented by an attribute ``num`` used to record
the sentence identifier (the ``n`` attribute from the XML).
c                 :    Xl         [        R                  X5        g r   )numlistr   )r   rW   itemss      r   r   BNCSentence.__init__   s    d"r   )rW   N)rH   rI   rJ   rK   rL   r   rM   rN   r   r   r>   r>      s    
#r   r>   c                   @    \ rS rSrSr1 Skr S rS rS rS r	S r
S	rg
)r,      zF
A stream backed corpus view specialized for use with the BNC corpus.
>   pbgapaligneventpauseshiftvocalunclearc                 R   U(       a  SnOSnX l         X0l        X@l        XPl        SU l        SU l        SU l        SU l        [        R                  " XU5        U R                  5         U R                  U R                  SU R                  5        U R                  5         SS0U l        g)a  
:param fileid: The name of the underlying file.
:param sent: If true, include sentence bracketing.
:param tag: The name of the tagset to use, or None for no tags.
:param strip_space: If true, strip spaces from word tokens.
:param stem: If true, then substitute stems for words.
z.*/sz.*/s/(.*/)?(c|w)Nz.*/teiHeader$r   rN   )_sent_tag_strip_space_stemtitleauthoreditorrespsr   r   _open
read_block_streamhandle_headerclose_tag_context)r   r0   r(   r!   r   r   tagspecs          r   r   BNCWordView.__init__   s     G(G
	'


tW5 	

ot7I7IJ

 Gr   c                    UR                  S5      nU(       a  SR                  S U 5       5      U l        UR                  S5      nU(       a  SR                  S U 5       5      U l        UR                  S5      nU(       a  SR                  S U 5       5      U l        UR                  S5      nU(       a  S	R                  S
 U 5       5      U l        g g )NztitleStmt/title
c              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   r:   r;   ).0rj   s     r   	<genexpr>,BNCWordView.handle_header.<locals>.<genexpr>   s     "J6%::#3#3#5#56   &(ztitleStmt/authorc              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   ry   )rz   rk   s     r   r{   r|           #NgFKK$5$5$7$7gr}   ztitleStmt/editorc              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   ry   )rz   rl   s     r   r{   r|      r   r}   ztitleStmt/respStmtz

c              3   R   #    U  H  nS R                  S U 5       5      v   M     g7f)rw   c              3   T   #    U  H  oR                   R                  5       v   M      g 7fr   ry   )rz   resp_elts     r   r{   6BNCWordView.handle_header.<locals>.<genexpr>.<genexpr>   s     EH----//r}   N)join)rz   resps     r   r{   r|      s&      %RW$		EEEERWs   %')r8   r   rj   rk   rl   rm   )r   rR   contexttitlesauthorseditorsrm   s          r   rq   BNCWordView.handle_header   s    ./"J6"JJDJ++01))#Ng#NNDK++01))#Ng#NNDK01 %RW% DJ r   c                 h    U R                   (       a  U R                  U5      $ U R                  U5      $ r   )rf   handle_senthandle_word)r   rR   r   s      r   
handle_eltBNCWordView.handle_elt   s+    ::##C((##C((r   c                    UR                   nU(       d  SnU R                  (       d  U R                  (       a  UR                  5       nU R                  (       a  UR	                  SU5      nU R
                  S:X  a  X!R	                  S5      4nU$ U R
                  S:X  a"  X!R	                  SUR	                  S5      5      4nU$ )Nr3   r4   r   r   )r:   rh   ri   r;   r<   rg   )r   rR   rF   s      r   r   BNCWordView.handle_word   s    xxD

::<D::774&D99''$-(D  YY%''%78Dr   c                    / nU H  nUR                   S;   a%  X# Vs/ s H  o@R                  U5      PM     sn-  nM8  UR                   S;   a"  UR                  U R                  U5      5        Mj  UR                   U R                  ;  d  M  [	        SUR                   -  5      e   [        UR                  S   U5      $ s  snf )N)mwhicorrtrunc)rQ   rP   zUnexpected element %sr5   )r!   r   r=   tags_to_ignore
ValueErrorr>   r?   )r   rR   r(   rS   rQ   s        r   r   BNCWordView.handle_sent   s    Eyy99e<e))!,e<<j(D,,U34$"5"55 !8599!DEE  3::c?D11 =s   B>)	rf   ri   rh   rg   rs   rk   rl   rm   rj   N)rH   rI   rJ   rK   rL   r   r   rq   r   r   r   rM   rN   r   r   r,   r,      s-    	N$@()	2r   r,   r   )rL   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r9   rX   r>   r,   rN   r   r   <module>r      sB    H * R R|o |~#$ #f2- f2r   