
    Of{c                         S r SrSSKrSSKJr  SSKJr  SSKJrJ	r	  SSK
JrJrJr  Sr " S	 S
\	5      rSS jr\S:X  a  \" 5         gg)z:
Corpus reader for the XML version of the CHILDES corpus.
z
epytext en    Ndefaultdict)concat)ElementTreeXMLCorpusReader)LazyConcatenationLazyMapflattenz#http://www.talkbank.org/ns/talkbankc                       \ rS rSrSrSS jr      SS jr      SS jr      SS jr      SS jr	SS	 jr
S
 rSS jrS rSS jrS rS rSS jrS rS r SrSS jrSrg)CHILDESCorpusReader   a  
Corpus reader for the XML version of the CHILDES corpus.
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
(``nltk_data/corpora/CHILDES/``).

For access to the file text use the usual nltk functions,
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
c                 >    [         R                  " XU5        X0l        g N)r   __init___lazy)selfrootfileidslazys       </usr/lib/python3/dist-packages/nltk/corpus/reader/childes.pyr   CHILDESCorpusReader.__init__&   s      W5
    Nc                   ^ ^^^^^^	^
 Sm
Sm	T R                   (       d8  T R                  U5       Vs/ s H  nT R                  UTT
TTT	TT5      PM     sn$ U	UUU U
UUU4S jn[        [	        UT R                  U5      5      5      $ s  snf )a  
:return: the given file(s) as a list of words
:rtype: list(str)

:param speaker: If specified, select specific speaker(s) defined
    in the corpus. Default is 'ALL' (all participants). Common choices
    are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
    researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
    dependent_index)
:param strip_space: If true, then strip trailing spaces from word
    tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
    of the original word (e.g., 'wat' will be replaced with 'watch')
NFc           
      4   > TR                  U TTTTTTT5      $ r   
_get_words	fileidposrelationreplacer   sentspeakerstemstrip_spaces	    r   <lambda>+CHILDESCorpusReader.words.<locals>.<lambda>M       4??GT43W$
r   r   abspathsr   r   r	   r   r   r#   r$   r    r%   r!   r   	get_wordsr   r"   s   ` `````  @@r   wordsCHILDESCorpusReader.words*   s    2 zz
 #mmG4	 5F GT43W 5	 
 
	 !DMM'4J!KLL   "Bc                   ^ ^^^^^^	^
 Sm
Sm	T R                   (       d8  T R                  U5       Vs/ s H  nT R                  UTT
TTT	TT5      PM     sn$ U	UUU U
UUU4S jn[        [	        UT R                  U5      5      5      $ s  snf )a  
:return: the given file(s) as a list of tagged
    words and punctuation symbols, encoded as tuples
    ``(word,tag)``.
:rtype: list(tuple(str,str))

:param speaker: If specified, select specific speaker(s) defined
    in the corpus. Default is 'ALL' (all participants). Common choices
    are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
    researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
    dependent_index)
:param strip_space: If true, then strip trailing spaces from word
    tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
    of the original word (e.g., 'wat' will be replaced with 'watch')
NTc           
      4   > TR                  U TTTTTTT5      $ r   r   r   s	    r   r&   2CHILDESCorpusReader.tagged_words.<locals>.<lambda>w   r(   r   r)   r+   s   ` `````  @@r   tagged_words CHILDESCorpusReader.tagged_wordsR       6 zz
 #mmG4	 5F GT43W 5	 
 
	 !DMM'4J!KLLr/   c                   ^ ^^^^^^	^
 Sm
Sm	T R                   (       d8  T R                  U5       Vs/ s H  nT R                  UTT
TTT	TT5      PM     sn$ U	UUU U
UUU4S jn[        [	        UT R                  U5      5      5      $ s  snf )a  
:return: the given file(s) as a list of sentences or utterances, each
    encoded as a list of word strings.
:rtype: list(list(str))

:param speaker: If specified, select specific speaker(s) defined
    in the corpus. Default is 'ALL' (all participants). Common choices
    are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
    researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
    If there is manually-annotated relation info, it will return
    tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
    tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
    of the original word (e.g., 'wat' will be replaced with 'watch')
TFc           
      4   > TR                  U TTTTTTT5      $ r   r   r   s	    r   r&   +CHILDESCorpusReader.sents.<locals>.<lambda>   r(   r   r)   r+   s   ` `````  @@r   sentsCHILDESCorpusReader.sents|   s    6 zz
 #mmG4	 5F GT43W 5	 
 
	 !DMM'4J!KLLr/   c                   ^ ^^^^^^	^
 Sm
Sm	T R                   (       d8  T R                  U5       Vs/ s H  nT R                  UTT
TTT	TT5      PM     sn$ U	UUU U
UUU4S jn[        [	        UT R                  U5      5      5      $ s  snf )a  
:return: the given file(s) as a list of
    sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))

:param speaker: If specified, select specific speaker(s) defined
    in the corpus. Default is 'ALL' (all participants). Common choices
    are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
    researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
    If there is manually-annotated relation info, it will return
    tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
    tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
    of the original word (e.g., 'wat' will be replaced with 'watch')
Tc           
      4   > TR                  U TTTTTTT5      $ r   r   r   s	    r   r&   2CHILDESCorpusReader.tagged_sents.<locals>.<lambda>   r(   r   r)   r+   s   ` `````  @@r   tagged_sents CHILDESCorpusReader.tagged_sents   r5   r/   c                     U R                   (       d0  U R                  U5       Vs/ s H  o R                  U5      PM     sn$ [        U R                  U R                  U5      5      $ s  snf )z]
:return: the given file(s) as a dict of ``(corpus_property_key, value)``
:rtype: list(dict)
)r   r*   _get_corpusr	   r   r   r   s      r   corpusCHILDESCorpusReader.corpus   sV    
 zz;?==;QR;Q$$V,;QRRt''w)?@@ S   A'c                     [        5       n[        R                  " U5      R                  5       nUR	                  5        H	  u  pEXRU'   M     U$ r   )dictr   parsegetrootitems)r   r   resultsxmldockeyvalues         r   rA   CHILDESCorpusReader._get_corpus   s@    &""6*224 ,,.JC CL )r   c                     U R                   (       d0  U R                  U5       Vs/ s H  o R                  U5      PM     sn$ [        U R                  U R                  U5      5      $ s  snf )zf
:return: the given file(s) as a dict of
    ``(participant_property_key, value)``
:rtype: list(dict)
)r   r*   _get_participantsr	   rB   s      r   participants CHILDESCorpusReader.participants   sV     zzAEwAWXAWv**62AWXXt--t}}W/EFF YrE   c                   ^ U4S jm[         R                  " U5      R                  5       nT" 5       nUR                  S[         S[         S35       H2  nUR                  5        H  u  pVXcUR                  S5         U'   M     M4     U$ )Nc                     > [        T 5      $ r   r   )dictOfDictss   r   rV   :CHILDESCorpusReader._get_participants.<locals>.dictOfDicts   s    {++r   .//{}Participants/{}participantid)r   rH   rI   findallNSrJ   get)r   r   rL   patparticipantrM   rN   rV   s          @r   rQ   %CHILDESCorpusReader._get_participants   s    	, ""6*224m!>>B4(M:
K *//1
27KOOD)*3/ 2

 
r   c                    ^ ^^ T R                   (       d3  T R                  U5       Vs/ s H  nT R                  UTT5      PM     sn$ UU U4S jn[        UT R                  U5      5      $ s  snf )z
:return: the given file(s) as string or int
:rtype: list or int

:param month: If true, return months instead of year-month-date
c                 *   > TR                  U TT5      $ r   )_get_age)r   monthr   r#   s    r   r&   )CHILDESCorpusReader.age.<locals>.<lambda>  s    vw!Fr   )r   r*   rd   r	   )r   r   r#   re   r   get_ages   ` ``  r   ageCHILDESCorpusReader.age   sg     zz #mmG44F fgu54  Gwg 677s   A+c                 \   [         R                  " U5      R                  5       nUR                  S[         S[         S35       HF  n UR                  S5      U:X  a-  UR                  S5      nU(       a  U R                  U5      nUs  $ MH     g ! [        [        4 a  n S nA  g S nAff = f)NrX   rY   rZ   r[   rh   )	r   rH   rI   r\   r]   r^   convert_age	TypeErrorAttributeError)r   r   r#   re   rL   r_   rh   es           r   rd   CHILDESCorpusReader._get_age  s    ""6*224>>E"->rd-"PQC774=G+''%.C"..s3J	 , R ~. s   	?BB+&B+c                    [         R                  " SU5      n[        UR                  S5      5      S-  [        UR                  S5      5      -   n [        UR                  S5      5      S:  a  US-  nU$ ! [         a  n SnAU$ SnAff = f)z8Caclculate age in months from a string in CHILDES formatzP(\d+)Y(\d+)M?(\d?\d?)D?               N)rematchintgroup
ValueError)r   age_yearm	age_monthrn   s        r   rk   CHILDESCorpusReader.convert_age  s    HH0(;
Ob(3qwwqz?:		1771:#Q	   		s   #A6 6
BBc                    ^ ^ T R                   (       d0  T R                  U5       Vs/ s H  nT R                  UTS9PM     sn$ U U4S jn[        UT R                  U5      5      $ s  snf )zE
:return: the given file(s) as a floating number
:rtype: list(float)
r#   c                 $   > TR                  U TS9$ )Nr   )_getMLU)r   r   r#   s    r   r&   )CHILDESCorpusReader.MLU.<locals>.<lambda>+  s    fg!Fr   )r   r*   r   r	   )r   r   r#   r   get_MLUs   ` `  r   MLUCHILDESCorpusReader.MLU!  sg    
 zz #mmG44F VW54  Gwg 677s   A&c                    U R                  UUSSSSSSS9n/ n/ nSnSnU H  nU V	V
s/ s H  u  pU
PM	     nn	n
[        S U 5       5      (       a  M2  U/ :X  a  M:  X:X  a  MA  UR                  U V	V
s/ s H  u  pU	PM	     sn
n	5        [        SS 1R	                  U5      5      S:  a+  XkR                  S5      -  nXkR                  S 5      -  nUS-  nUnM      [        U5      n[        [        U V	s/ s H  oR                  S5      PM     sn	5      5      U-
  n[        U5      U-
  nX-  nU$ s  sn
n	f s  sn
n	f s  sn	f ! [         a    Sn U$ f = f)	NTF)r#   r"   r$   r    r   r%   r!   r   c              3   *   #    U  H	  oS :H  v   M     g7f)unkN ).0r   s     r   	<genexpr>.CHILDESCorpusReader._getMLU.<locals>.<genexpr>@  s     37C%<7s   corq   -)	r   anyappendlenintersectioncountr
   splitZeroDivisionError)r   r   r#   r9   rK   lastSent
numFillerssentDiscountr"   wordr   posListthisWordListnumWordsnumSentsmlus                   r   r   CHILDESCorpusReader._getMLU.  s{      	
 
D.23d{sdG337333!=$=>d|009:Q>--"55J--"55J A%LH% &
	"7+L GFZZ_FGH:U  7|l2H%C 
= 4  > G ! 	C
	s/   D/2D5E  3D; E  ;E   EEc	                    [        U[        5      (       a	  US:w  a  U/n[        R                  " U5      R	                  5       n	/ n
U	R                  S[        -  5       GH  n/ nUS:X  d  UR                  S5      U;   d  M#  UR                  S[        -  5       GH  nS nS nS nU(       aM  UR                  S[         S[         S35      (       a(  UR                  S[         S[         S[         S	35      nOLU(       aE  UR                  S[         S[         S
35      (       a   UR                  S[         S[         S
35      nUR                  (       a  UR                  nOSnU(       a  UR                  5       nU(       d  U(       a   UR                  S[        -  5      nUR                  n UR                  S[         S[         S[         S35      nUSUR                  -   -  n UR                  S[        < S[        < S[        < S[        < S3	5      nUR                  nU(       a  USU-   -  nU(       d  U(       Ga,   UR                  S[        -  5      nUR                  S[        -  5      nU/ :w  a#  US   R                  S-   US   R                  -   nOUS   R                  n  UR                  S[        < S[        < S[        < S[        < S[        < S35      nUR                  S[        < S[        < S[        < S[        < S[        < S35      nU(       a#  US   R                  S-   US   R                  -   nOUS   R                  n U(       a  USU-   -  nUU4nUS:X  Ga  UR                  S[         S[         S35       H  nUR                  S5      S:X  dF  US   US   UR                  S 5      S!-   UR                  S"5      -   S!-   UR                  S#5      -   4nM^  US   US   US$   US   US   UR                  S 5      S!-   UR                  S"5      -   S!-   UR                  S#5      -   4nM      UR                  S[         S[         S[         S35       H  nUR                  S5      S:X  dF  US   US   UR                  S 5      S!-   UR                  S"5      -   S!-   UR                  S#5      -   4nM^  US   US   US$   US   US   UR                  S 5      S!-   UR                  S"5      -   S!-   UR                  S#5      -   4nM     UR                  U5        GM     U(       d  U(       a  U
R                  U5        GM  U
R                  U5        GM     [        S% U
5      $ ! [         a  n S nAGNS nAff = f!    GN= f! [         a    Sn GNf = f! [        [        4 a  nSn S nAGNS nAff = f!    GNy= f!    N= f)&NALLz.//{%s}uwhoz.//{%s}wrX   z}w/{z}replacementz}replacement/{z}wz}wk z.//{%s}stemz}mor/{z}mw/{z}mkr   z}mor-post/{z}stem~z.//{%s}cz.//{%s}sr   :z}pos/{z}cz}sTz}gratypegrtrq   index|headr    rs   c                     U $ r   r   )xs    r   r&   0CHILDESCorpusReader._get_words.<locals>.<lambda>  s    r   )
isinstancestrr   rH   rI   r\   r]   r^   findtextstriprm   
IndexErrorr   extendr	   )r   r   r#   r"   r$   r    r   r%   r!   rL   rK   xmlsentr9   xmlwordinfl
suffixStem	suffixTagr   xmlstemrn   xmlinfl	xmlsuffixxmlposxmlpos2tagxmlsuffixposxmlsuffixpos2xmlstem_relxmlpost_rels                                r   r   CHILDESCorpusReader._get_words^  s    w$$E)9iG""6*224~~j2o6GE%7;;u#5#@&zB?GD!%J $I7<<%t6"]0S#T#T"),,#B4vbT1A"SI# !W\\E"VB4t2L%M%M"),,rd&D/I"J||&||!"#zz|4!&-ll=23E&FG#*<<D!&-ll"'t8B4wrd$ G'G !C',,$66D,(/#%r2r!3)I *3J & C*$44D3%%,__Z"_%EF&-ooj2o&FG&"}&,Qinns&:WQZ__&L&,Qinn!+2??#%r2r2!7,L -4OO#%r2r2!7-M  -$0O$8$83$>qAQAVAV$V !* -9O,@,@	 %3?2C $c{  4'+2??#B4xt59,K $/??6#:e#C$(G$(G$/OOG$<&)%*&1oof&=%> '*%* '2ooj&A	%B(" %)G$(G$(G$(G$(G$/OOG$<&)%*&1oof&=%> '*%* '2ooj&A	%B(",4!/6"'t8B4}RD N0 (3v'>%'G(21(21(3(@*-).*5//&*A)B +.). +6//**E	)F2&J )31(21(21(21(21(3(@*-).*5//&*A)B +.). +6//**E	)F2&J08 LL&O  @P 8NN5)NN5)_ 7` {G,,i  . ! !!   . ,)+J, !/
; %"$C%"! z! so   $U=-9V'>VAV0V0.BWW	CW=
VVVV-,V-0W WWWWz3https://childes.talkbank.org/browser/index.php?url=c                 
   SSK nU(       a	  US-   U-   nOU R                  S-   U-   n[        R                  " SSU5      nSUR	                  5       ;   a  [        R
                  " SU5      S   nO4SUR	                  5       ;   a  S[        R
                  " S	U5      S   -   nOUnUR                  S
5      (       a  USS nUR                  S5      (       d  US-   nU R                  U-   nUR                  U5        [        SU5        g)aU  Map a corpus file to its web version on the CHILDES website,
and open it in a web browser.

The complete URL to be used is:
    childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

If no urlbase is passed, we try to calculate it.  This
requires that the childes corpus was set up to mirror the
folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
nltk_data/corpora/childes/Eng-USA/Cornell/??? or
nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

The function first looks (as a special case) if "Eng-USA" is
on the path consisting of <corpus root>+fileid; then if
"childes", possibly followed by "data-xml", appears. If neither
one is found, we use the unmodified fileid and hope for the best.
If this is not right, specify urlbase explicitly, e.g., if the
corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
r   N/z\\z	/childes/z$(?i)/childes(?:/data-xml)?/(.*)\.xmlzeng-usazEng-USA/z/(?i)Eng-USA/(.*)\.xmlz.xmlz.chazOpening in browser:)

webbrowserr   rv   sublowerr\   endswithchildes_url_baseopen_new_tabprint)r   r   urlbaser   pathfullurls          r   webview_file CHILDESCorpusReader.webview_file  s    * 	S=6)D99s?V+D66%d+Ddjjl*zz"I4PQRSdjjl*!BJJ/H$$OPQ$RR ==  9D}}V$$&=D##d*$#S)r   )r   )T)Nr   FFTF)Nr   FNTFr   )NCHIF)Nr   )__name__
__module____qualname____firstlineno____doc__r   r-   r3   r9   r>   rC   rA   rR   rQ   rh   rd   rk   r   r   r   r   r   __static_attributes__r   r   r   r   r      s    	 &MT (MX (MX (MTAG8
8.`Z-|
 N.*r   r   c                 ,   U (       d  SSK Jn  U" S5      n  [        U S5      nUR                  5       SS  GH8  nSnSnUR	                  U5      S   R                  5        H  u  pgUS:X  a  UnUS	:X  d  M  UnM     [        S
XES5        [        SUR                  U5      SS S5        [        SUR                  USS9SS S5        [        SUR                  U5      SS S5        [        SUR                  USS9SS S5        [        SUR                  USS9SS S5        [        SUR                  USS9SS S5        [        SUR                  USS9SS S5        [        SUR                  U5      SS S5        UR                  U5      S   R                  5        H,  u  pU	R                  5        H  u  pg[        SXS U5        M     M.     [        S![        UR                  U5      5      5        [        S"[        UR                  USS95      5        [        S#UR                  U5      5        [        S$UR                  USS%95        [        S&UR                  U5      5        [        5         GM;     g! [         a  n
[        S'5         Sn
A
gSn
A
ff = f)(zd
The CHILDES corpus should be manually downloaded and saved
to ``[NLTK_Data_Dir]/corpora/childes/``
r   )r   z!corpora/childes/data-xml/Eng-USA/z.*.xmlN   r   CorpusIdReadingz .....zwords:   z...zwords with replaced words:T)r!   z ...zwords with pos tags:zwords (only MOT):MOTr   zwords (only CHI):r   zstemmed words:)r$   z!words with relations and pos-tag:)r    z	sentence:rs   z	participantr   znum of sent:znum of morphemes:zage:zage in month:)re   zMLU:aU  The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/")
        )	nltk.datar   r   r   rC   rJ   r   r-   r3   r9   rR   r   rh   r   LookupError)corpus_rootr   childesfilerC   	corpus_idrM   rN   r`   valuesrn   s              r   demor   6  s   
 ">?.
%k8<OO%bq)DFI%nnT215;;=
(?"F$; %I	 >
 )V9(GMM$/3U;,dD1"15
 ('*>*>t*DRa*H&Q%w}}T5}'I"1'MuU%w}}T5}'I"1'MuU"GMM$TM$B2A$FO3dT22A6
 +w}}T22A6?'.';';D'A!'D'J'J'L#"(,,.JC/;S%H #1 (M .#gmmD&9":;%s7==D=+I'JK&'++d+,/7;;t4;#@A&'++d+,GC *F  
	
 	

s   AI4 5G>I4 4
J>JJ__main__r   )r   __docformat__rv   collectionsr   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   	nltk.utilr   r	   r
   r]   r   r   r   r   r   r   <module>r      sS     	 # * C 9 9 +W*/ W*x8
~ zF r   