
    Of                         S SK r S SKJrJr  S SKJrJrJr  S SKJ	r	  S r
 " S S\5      rS rS	 r\S
:X  a  \" 5         gg)    N)CorpusReaderSyntaxCorpusReader)FileSystemPathPointerfind_corpus_fileidsread_blankline_block)DependencyGraphc                 2    SR                  S U  5       5      $ )N/c              3   @   #    U  H  oS    S:w  d  M  US    v   M     g7f)r   EOSN .0ms     9/usr/lib/python3/dist-packages/nltk/corpus/reader/knbc.py	<genexpr><lambda>.<locals>.<genexpr>   s     -TFqdemdadFs   joinmorphss    r   <lambda>r      s    SXX-TF-T%T    c                   B    \ rS rSrSrS\4S jrS rS rSS jr	S	 r
S
rg)KNBCorpusReader   aG  
This class implements:
  - ``__init__``, which specifies the location of the corpus
    and a method for detecting the sentence blocks in corpus files.
  - ``_read_block``, which reads a block from the input stream.
  - ``_word``, which takes a block and returns a list of list of words.
  - ``_tag``, which takes a block and returns a list of list of tagged
    words.
  - ``_parse``, which takes a block and returns a list of parsed
    sentences.

The structure of tagged words:
  tagged_word = (word(str), tags(tuple))
  tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

Usage example

>>> from nltk.corpus.util import LazyCorpusLoader
>>> knbc = LazyCorpusLoader(
...     'knbc/corpus1',
...     KNBCorpusReader,
...     r'.*/KN.*',
...     encoding='euc-jp',
... )

>>> len(knbc.sents()[0])
9

utf8c                 >    [         R                  " XX#5        X@l        g)zv
Initialize KNBCorpusReader
morphs2str is a function to convert morphlist to str for tree representation
for _parse()
N)r   __init__
morphs2str)selfrootfileidsencodingr    s        r   r   KNBCorpusReader.__init__7   s     	##DB$r   c                     [        U5      $ N)r   )r!   streams     r   _read_blockKNBCorpusReader._read_block@   s    #F++r   c                     / nUR                  5        HT  n[        R                  " SU5      (       a  M!  UR                  5       R	                  S5      nUR                  US   5        MV     U$ )NEOS|\*|\#|\+ r   )
splitlinesrematchstripsplitappend)r!   treslinecellss        r   _wordKNBCorpusReader._wordD   sU    LLND88OT22

**3/

58$	 # 
r   Nc           	         / nUR                  5        Hh  n[        R                  " SU5      (       a  M!  UR                  5       R	                  S5      nUR                  US   SR                  USS  5      45        Mj     U$ )Nr,   r-   r      )r.   r/   r0   r1   r2   r3   r   )r!   r4   tagsetr5   r6   r7   s         r   _tagKNBCorpusReader._tagO   si    LLND88OT22

**3/

E!HchhuQRy&9:; # 
r   c                 ^   [        5       nSnUR                  5        GH4  nUS   S;   a  UR                  5       R                  SS5      n[        R
                  " SUS   5      nUc   eUR                  U   nUR                  X6R                  S5      / S.5        [        UR                  S5      5      nUS	:X  a  Xrl
        O!UR                  U   S
   R                  U5        US-  nM  US   S:w  d  M  UR                  5       R                  S5      nUS   SR                  USS  5      4n	UR                  US-
     S   R                  U	5        GM7     U R                  (       a8  UR                  R                  5        H  nU R                  US   5      US'   M     UR                  5       $ )Nr   z*+r-      z([\-0-9]*)([ADIP])r;      )addressrelworddeps#rD   )r   r.   r1   r2   r/   r0   nodesupdategroupintr"   r3   r   r    valuestree)
r!   r4   dgir6   r7   r   node
dep_parentmorphs
             r   _parseKNBCorpusReader._parseZ   sk   LLNDAw$ 

**32HH2E!H=}$}xx{''!*bIJ _
#"GHHZ(077:QaC

**3/a#((59"55Q'..u53 #6 ??)#tF|<V * wwyr   )r    r'   )__name__
__module____qualname____firstlineno____doc___morphs2str_defaultr   r)   r8   r=   rS   __static_attributes__r   r   r   r   r      s(    < 06BU %,	"r   r   c            	         SS K n SSKJn  U R                  R	                  S5      n[        [        U5      S5       Vs/ s H#  n[        R                  " SU5      (       d  M!  UPM%     nnS nU" S[        [        XES9S	S
9n[        UR                  5       S S 5        [        SR                  UR                  5       S S 5      5        [        SR                  S UR                  5       S S  5       5      5        S Ul        [        SR                  S UR                  5       S S  5       5      5        [        SR                  S UR#                  5       SS  5       5      5        g s  snf )Nr   LazyCorpusLoaderzcorpora/knbc/corpus1z.*z\d\-\d\-[\d]+\-[\d]+c                 ~    U R                  S5      nUS   [        US   5      [        US   5      [        US   5      4$ )N-r   r;   rA   r@   )r2   rK   )xr7   s     r   _knbc_fileids_sort demo.<locals>._knbc_fileids_sort   s:    a#eAh-U1XE!HFFr   knbc/corpus1)keyeuc-jpr$   
    d   z

c              3   8   #    U  H  n[        U5      v   M     g 7fr'   )strr   rM   s     r   r   demo.<locals>.<genexpr>   s     D,CDc$ii,Cs   rA   c                 P    SR                  S U  5       5      R                  S5      $ )Nr
   c              3      #    U  H9  oS    S:w  d  M  SR                  US    US   R                  S5      S   5      v   M;     g7f)r   r   z{}({})r;   r-   rA   Nformatr2   r   s     r   r   )demo.<locals>.<lambda>.<locals>.<genexpr>   sA      .;AaqTU]1!adjjoa0116s
   A1Azutf-8)r   encoder   s    r   r   demo.<locals>.<lambda>   s(    SXX .;A. &fWo&r   c              3   ,   #    U  H
  nS U-  v   M     g7f)z%sNr   rm   s     r   r   rn      s     F.EddTk.Es   
c              3   R   #    U  H  nS R                  S U 5       5      v   M     g7f)r-   c              3   v   #    U  H/  nS R                  US   US   R                  S5      S   5      v   M1     g7f)z{}/{}r   r;   r-   rA   Nrq   )r   ws     r   r   !demo.<locals>.<genexpr>.<genexpr>   s3     Lt!W^^AaD!A$**S/!*<==ts   79Nr   )r   sents     r   r   rn      s)      
0 HHLtLLL0s   %')nltknltk.corpus.utilr^   datafindr   r   r/   searchr   sortedprintr#   r   wordsparsed_sentsr    tagged_sents)r}   r^   r"   fr#   rb   knbcs          r   demor      s@   199>>01D %%:4%@$GGA99,a0 	
G  G w/	D 
$,,."
	"''$**,t$
%&	&++DD,=,=,?,CD
DEDO 
&++Fd.?.?.A"1.EF
FG			 
))+Aa0
 	
9s    E&!E&c                  v   SSK Jn   U " S[        SSS9n[        UR	                  5       S   [
        5      (       d   e[        UR                  5       S   S   [
        5      (       d   e[        UR                  5       S   [        5      (       d   e[        UR                  5       S   S   [        5      (       d   eg )Nr   r]   rd   z.*/KN.*rf   rg   )
r~   r^   r   
isinstancer   rl   sentstagged_wordstupler   )r^   r   s     r   testr      s    1hD djjl1os++++djjl1oa(#....d'')!,e4444d'')!,Q/7777r   __main__)r/   nltk.corpus.reader.apir   r   nltk.corpus.reader.utilr   r   r   
nltk.parser   rZ   r   r   r   rU   r   r   r   <module>r      sT    
 C 
 ' U d( dX&R	8 zF r   