
    Ofx                     Z    S r SSKrSSKrSSKJr  SSKJrJrJr  SSK	J
r
   " S S\5      rg)z{
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
    N)CorpusReader)StreamBackedCorpusViewZipFilePathPointerconcat)TweetTokenizerc                   \    \ rS rSrSr\r S\" 5       S4S jrSS jr	SS jr
SS jrS	 rS
rg)TwitterCorpusReader   a  
Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

Individual Tweets can be tokenized using the default tokenizer, or by a
custom tokenizer specified as a parameter to the constructor.

Construct a new Tweet corpus reader for a set of documents
located at the given root directory.

If you made your own tweet collection in a directory called
`twitter-files`, then you can initialise the reader as::

    from nltk.corpus import TwitterCorpusReader
    reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

However, the recommended approach is to set the relevant directory as the
value of the environmental variable `TWITTER`, and then invoke the reader
as follows::

   root = os.environ['TWITTER']
   reader = TwitterCorpusReader(root, '.*\.json')

If you want to work directly with the raw Tweets, the `json` library can
be used::

   import json
   for tweet in reader.docs():
       print(json.dumps(tweet, indent=1, sort_keys=True))

Nutf8c                    [         R                  " XX$5        U R                  U R                  5       HL  n[	        U[
        5      (       a  M  [        R                  R                  U5      S:X  d  M?  [        SU S35      e    X0l
        g)z
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
:param word_tokenizer: Tokenizer for breaking the text of Tweets into
    smaller units, including but not limited to words.
r   zFile z	 is emptyN)r   __init__abspaths_fileids
isinstancer   ospathgetsize
ValueError_word_tokenizer)selfrootfileidsword_tokenizerencodingr   s         </usr/lib/python3/dist-packages/nltk/corpus/reader/twitter.pyr   TwitterCorpusReader.__init__:   sq     	d'<MM$--0D$ 233&!+ 5i!899	 1
 	F-    c                     [        U R                  USS5       VVVs/ s H   u  p#nU R                  X R                  US9PM"     snnn5      $ s  snnnf )z
Returns the full Tweet objects, as specified by `Twitter
documentation on Tweets
<https://dev.twitter.com/docs/platform-objects/tweets>`_

:return: the given file(s) as a list of dictionaries deserialised
    from JSON.
:rtype: list(dict)
T)r   )r   r   
CorpusView_read_tweets)r   r   r   encfileids        r   docsTwitterCorpusReader.docsN   sX      ,0==$+M+M'T &7&7#F+M
 	
s   'Ac                     U R                  U5      n/ nU HJ  n US   n[        U[        5      (       a  UR                  U R                  5      nUR                  U5        ML     U$ ! [         a     M]  f = f)z{
Returns only the text content of Tweets in the file(s)

:return: the given file(s) as a list of Tweets.
:rtype: list(str)
text)r#   r   bytesdecoder   appendKeyError)r   r   
fulltweetstweetsjsonor&   s         r   stringsTwitterCorpusReader.strings_   su     YYw'
EV}dE**;;t}}5Dd#     s   AA&&
A43A4c                     U R                  U5      nU R                  nU Vs/ s H  oCR                  U5      PM     sn$ s  snf )z
:return: the given file(s) as a list of the text content of Tweets as
    as a list of words, screenanames, hashtags, URLs and punctuation symbols.

:rtype: list(list(str))
)r.   r   tokenize)r   r   r,   	tokenizerts        r   	tokenizedTwitterCorpusReader.tokenizedr   s>     g&((	/56v!""1%v666s   ?c                     / n[        S5       HE  nUR                  5       nU(       d  Us  $ [        R                  " U5      nUR	                  U5        MG     U$ )zC
Assumes that each line in ``stream`` is a JSON-serialised object.

   )rangereadlinejsonloadsr)   )r   streamr,   ilinetweets         r   r     TwitterCorpusReader._read_tweets}   sL     rA??$DJJt$EMM%   r   )r   )N)__name__
__module____qualname____firstlineno____doc__r   r   r   r   r#   r.   r4   r    __static_attributes__ r   r   r	   r	      s8    > (J
 !1AF.(
"&	7r   r	   )rE   r:   r   nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   nltk.tokenizer   r	   rG   r   r   <module>rK      s,   
  	 / V V (s, sr   