7.6 Twitter data

We use the twitter.json data accessed from here. This is a JSON file (.json) downloaded from the Twitter stream API.

7.6.1 Importing data

# Import the Twitter JSON file with readtext(), declaring its source as "twitter".
dat_twitter <- readtext("data/twitter.json", source = "twitter")

7.6.1.1 Create a corpus

# Turn the imported data into a corpus and print a preview of it.
dat_twitter_corpus <- corpus(dat_twitter)
print(dat_twitter_corpus)
## Corpus consisting of 7,504 documents and 42 docvars.
## twitter.json.1 :
## "@EFC_Jayy UKIP"
## 
## twitter.json.2 :
## "RT @Corbynator2:@jeremycorbyn Reaction from people at the Wa..."
## 
## twitter.json.3 :
## "RT @ryvr: Stephen Hawking, the world’s smartest man, backs J..."
## 
## twitter.json.4 :
## "RT @TheGreenParty: How you cast your vote will shape the fut..."
## 
## twitter.json.5 :
## "RT @UKLabour: #VoteLabour today for a fairer Britain. Find o..."
## 
## twitter.json.6 :
## "RT @WestmonsterUK: POLLS: FINAL DAY AVERAGE CON 43.8% LAB 35..."
## 
## [ reached max_ndoc ... 7,498 more documents ]

7.6.1.2 Summary

# Summarise the first 10 documents: type/token/sentence counts plus the docvars.
summary(dat_twitter_corpus, 10)
## Corpus consisting of 7504 documents, showing 10 documents:
## 
##             Text Types Tokens Sentences retweet_count favorite_count favorited
##   twitter.json.1     2      2         1             0              0     FALSE
##   twitter.json.2    24     30         3            90            108     FALSE
##   twitter.json.3    15     16         1            78            104     FALSE
##   twitter.json.4    20     23         3           244            218     FALSE
##   twitter.json.5    23     23         2          1896           2217     FALSE
##   twitter.json.6    18     22         1            55             52     FALSE
##   twitter.json.7    28     34         2            65             73     FALSE
##   twitter.json.8    23     23         1            30              9     FALSE
##   twitter.json.9    23     23         2          1896           2217     FALSE
##  twitter.json.10    26     27         2             0              0     FALSE
##  truncated             id_str in_reply_to_screen_name
##      FALSE 872596537142116352                EFC_Jayy
##      FALSE 872596536869363712                    <NA>
##      FALSE 872596537444093952                    <NA>
##      FALSE 872596538492637185                    <NA>
##      FALSE 872596538480103425                    <NA>
##      FALSE 872596538815651841                    <NA>
##      FALSE 872596537376989185                    <NA>
##      FALSE 872596538366849024                    <NA>
##      FALSE 872596538811457540                    <NA>
##       TRUE 872596537632804864                    <NA>
##                                                                                source
##    <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
##                    <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##                    <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##  <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
##    <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
##    <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
##  <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
##                    <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##    <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
##  <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
##  retweeted                     created_at in_reply_to_status_id_str
##      FALSE Wed Jun 07 23:30:01 +0000 2017        872596176834572288
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##      FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##  in_reply_to_user_id_str lang listed_count verified
##               4556760676   en            1    FALSE
##                     <NA>   en           28    FALSE
##                     <NA>   en            2    FALSE
##                     <NA>   en            3    FALSE
##                     <NA>   en            6    FALSE
##                     <NA>   en            2    FALSE
##                     <NA>   en           12    FALSE
##                     <NA>   en           90    FALSE
##                     <NA>   en            1    FALSE
##                     <NA>   en           25    FALSE
##                        location        user_id_str
##                           Japan 863929468984995840
##                        Gondwana          153295243
##  LDN rt/mention/follow/link ≠ e          273731990
##                   East, England          477177095
##                          London          241664322
##  AKast UDI RN PRI Evopoli FKast 811768906301968384
##                       Liverpool         2157745426
##   Social Affairs Editor: Scisco          586255942
##                            <NA>         4551111388
##               Glasgow, Scotland          704227420
##                                                                                                                                                      description
##                                                                                                                                                             <NA>
##                                                                                         #Black. #Green. #Red. #Aboriginal. #Environmental. #Socialist. #Atheist.
##     Infovore, atheist, post-Ⓐ, p/t nihilist, lifelong radiophile, aspiring cultural terrorist 🏴  ☮️  🇵🇸  \nImages: @laborglitch/Airside\nM: openlyclassist@x0r.be
##                                                                               think outside your own perspective and find transcendence bypassing state opulence
##                                                                                                                                                             🇩🇿🇲🇦
##                                REPUBLICA LIBERTARIA NO ➕BUROCRACIA ESTATISTA MONOPOLISTA IMPRODUCTIVA 🇦🇷🇨🇱🇺🇾🇫🇷🇺🇸🇪🇸PROTOPÍA #STOPCastrituyente #PiñeraPresidente
##                               LFC. Lets get Labour back in power! Its not left vs right, its right vs wrong. Tories are morally and spiritually bankrupt. #JC4PM
##     Public interest issues, policy, human rights & social sciences. Researcher and writer for: Politics and Insights (home site), @Welfare_Weekly, @SciscoMedia.
##                                                                                                                                                             <NA>
##  artist. sick and tired of not being allowed an opinion in a so called free country! if I offend thee...you can fuck right off you CUNT! I thank you *takes bow*
##  geo_enabled                user_created_at statuses_count followers_count
##        FALSE Mon May 15 01:30:11 +0000 2017           2930             367
##        FALSE Tue Jun 08 05:05:23 +0000 2010          10223             845
##        FALSE Tue Mar 29 01:59:32 +0000 2011          20934             761
##        FALSE Sat Jan 28 22:41:07 +0000 2012          13603             321
##         TRUE Sat Jan 22 20:53:11 +0000 2011          13179             386
##        FALSE Thu Dec 22 03:02:43 +0000 2016          52384             792
##        FALSE Mon Oct 28 21:16:49 +0000 2013           8572             361
##        FALSE Mon May 21 04:46:43 +0000 2012          33759            4475
##         TRUE Sun Dec 20 23:27:00 +0000 2015            171              21
##         TRUE Tue Oct 08 00:09:09 +0000 2013          21568             837
##  favourites_count protected                             user_url
##              1260     FALSE                                 <NA>
##              9813     FALSE                                 <NA>
##             14733     FALSE                                 <NA>
##              5421     FALSE http://www.instagram.com/stuhornett/
##              5219     FALSE                                 <NA>
##                 6     FALSE                                 <NA>
##             11592     FALSE                                 <NA>
##             18368     FALSE   https://kittysjones.wordpress.com/
##               120     FALSE                                 <NA>
##             12177     FALSE                                 <NA>
##              name  time_zone user_lang utc_offset friends_count    screen_name
##          ジョージ       <NA>        en         NA           304       CoysJoji
##   Yara-ma-yha-who     London        en       3600           439      Unkle_Ken
##   Openly classist     London        en       3600          2761 OpenlyClassist
##               Stu Casablanca        en          0           767     StuHornett
##            Ousama     Dublin        en       3600           257      ousamaZ18
##     Capitán Tatán       <NA>        es         NA           353      omentando
##               Kev       <NA>        en         NA           357       YahMelia
##  RevolutionBreeze     London        en       3600          4999   suejonessays
##              Paul       <NA>     en-GB         NA            77    probinson63
##  Lady Jan(e) Gray  Edinburgh        en       3600          1263  jansengray_jo
##  country_code country place_type full_name place_name place_id place_lat
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##          <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##  place_lon lat lon
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##        NaN  NA  NA
##                                                                                                                                         expanded_url
##                                                                                                                                                 <NA>
##                                                                                                                                                 <NA>
##  http://www.independent.co.uk/news/science/stephen-hawking-jeremy-corbyn-labour-theresa-may-conservatives-endorsement-general-election-a7774016.html
##                                                                                                                                                 <NA>
##                                                                                                                                http://bit.ly/2s0qDvb
##                                                                                                      http://www.westmonster.com/final-polls-verdict/
##                                                                                                                                                 <NA>
##                                                                                                                                                 <NA>
##                                                                                                                                http://bit.ly/2s0qDvb
##                                                                                                  https://twitter.com/i/web/status/872596537632804864
##                      url
##                     <NA>
##                         
##  https://t.co/2kl3ayLd44
##                     <NA>
##  https://t.co/953WYc2p0c
##  https://t.co/6zBHeHWuru
##                         
##                         
##  https://t.co/953WYc2p0c
##  https://t.co/qYdu0hUeey

7.6.1.3 Accessing parts of corpus

# NOTE(review): the output below still lists all 7,504 documents, so `[, 1]`
# does not appear to subset the corpus; if only the first document is wanted,
# dat_twitter_corpus[1] is presumably the intended call — confirm.
dat_twitter_corpus[,1]
## Corpus consisting of 7,504 documents and 42 docvars.
## twitter.json.1 :
## "@EFC_Jayy UKIP"
## 
## twitter.json.2 :
## "RT @Corbynator2:@jeremycorbyn Reaction from people at the Wa..."
## 
## twitter.json.3 :
## "RT @ryvr: Stephen Hawking, the world’s smartest man, backs J..."
## 
## twitter.json.4 :
## "RT @TheGreenParty: How you cast your vote will shape the fut..."
## 
## twitter.json.5 :
## "RT @UKLabour: #VoteLabour today for a fairer Britain. Find o..."
## 
## twitter.json.6 :
## "RT @WestmonsterUK: POLLS: FINAL DAY AVERAGE CON 43.8% LAB 35..."
## 
## [ reached max_ndoc ... 7,498 more documents ]

7.6.1.4 Document-level information

# Show the first six rows of the document-level variables (docvars).
head(docvars(dat_twitter_corpus))
##   retweet_count favorite_count favorited truncated             id_str
## 1             0              0     FALSE     FALSE 872596537142116352
## 2            90            108     FALSE     FALSE 872596536869363712
## 3            78            104     FALSE     FALSE 872596537444093952
## 4           244            218     FALSE     FALSE 872596538492637185
## 5          1896           2217     FALSE     FALSE 872596538480103425
## 6            55             52     FALSE     FALSE 872596538815651841
##   in_reply_to_screen_name
## 1                EFC_Jayy
## 2                    <NA>
## 3                    <NA>
## 4                    <NA>
## 5                    <NA>
## 6                    <NA>
##                                                                                 source
## 1   <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
## 2                   <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
## 3                   <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
## 4 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 5   <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
## 6   <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
##   retweeted                     created_at in_reply_to_status_id_str
## 1     FALSE Wed Jun 07 23:30:01 +0000 2017        872596176834572288
## 2     FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
## 3     FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
## 4     FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
## 5     FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
## 6     FALSE Wed Jun 07 23:30:01 +0000 2017                      <NA>
##   in_reply_to_user_id_str lang listed_count verified
## 1              4556760676   en            1    FALSE
## 2                    <NA>   en           28    FALSE
## 3                    <NA>   en            2    FALSE
## 4                    <NA>   en            3    FALSE
## 5                    <NA>   en            6    FALSE
## 6                    <NA>   en            2    FALSE
##                         location        user_id_str
## 1                          Japan 863929468984995840
## 2                       Gondwana          153295243
## 3 LDN rt/mention/follow/link ≠ e          273731990
## 4                  East, England          477177095
## 5                         London          241664322
## 6 AKast UDI RN PRI Evopoli FKast 811768906301968384
##                                                                                                                                                    description
## 1                                                                                                                                                         <NA>
## 2                                                                                     #Black. #Green. #Red. #Aboriginal. #Environmental. #Socialist. #Atheist.
## 3 Infovore, atheist, post-Ⓐ, p/t nihilist, lifelong radiophile, aspiring cultural terrorist 🏴  ☮️  🇵🇸  \nImages: @laborglitch/Airside\nM: openlyclassist@x0r.be
## 4                                                                           think outside your own perspective and find transcendence bypassing state opulence
## 5                                                                                                                                                         🇩🇿🇲🇦
## 6                            REPUBLICA LIBERTARIA NO ➕BUROCRACIA ESTATISTA MONOPOLISTA IMPRODUCTIVA 🇦🇷🇨🇱🇺🇾🇫🇷🇺🇸🇪🇸PROTOPÍA #STOPCastrituyente #PiñeraPresidente
##   geo_enabled                user_created_at statuses_count followers_count
## 1       FALSE Mon May 15 01:30:11 +0000 2017           2930             367
## 2       FALSE Tue Jun 08 05:05:23 +0000 2010          10223             845
## 3       FALSE Tue Mar 29 01:59:32 +0000 2011          20934             761
## 4       FALSE Sat Jan 28 22:41:07 +0000 2012          13603             321
## 5        TRUE Sat Jan 22 20:53:11 +0000 2011          13179             386
## 6       FALSE Thu Dec 22 03:02:43 +0000 2016          52384             792
##   favourites_count protected                             user_url
## 1             1260     FALSE                                 <NA>
## 2             9813     FALSE                                 <NA>
## 3            14733     FALSE                                 <NA>
## 4             5421     FALSE http://www.instagram.com/stuhornett/
## 5             5219     FALSE                                 <NA>
## 6                6     FALSE                                 <NA>
##              name  time_zone user_lang utc_offset friends_count    screen_name
## 1        ジョージ       <NA>        en         NA           304       CoysJoji
## 2 Yara-ma-yha-who     London        en       3600           439      Unkle_Ken
## 3 Openly classist     London        en       3600          2761 OpenlyClassist
## 4             Stu Casablanca        en          0           767     StuHornett
## 5          Ousama     Dublin        en       3600           257      ousamaZ18
## 6   Capitán Tatán       <NA>        es         NA           353      omentando
##   country_code country place_type full_name place_name place_id place_lat
## 1         <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
## 2         <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
## 3         <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
## 4         <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
## 5         <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
## 6         <NA>    <NA>         NA      <NA>       <NA>     <NA>       NaN
##   place_lon lat lon
## 1       NaN  NA  NA
## 2       NaN  NA  NA
## 3       NaN  NA  NA
## 4       NaN  NA  NA
## 5       NaN  NA  NA
## 6       NaN  NA  NA
##                                                                                                                                          expanded_url
## 1                                                                                                                                                <NA>
## 2                                                                                                                                                <NA>
## 3 http://www.independent.co.uk/news/science/stephen-hawking-jeremy-corbyn-labour-theresa-may-conservatives-endorsement-general-election-a7774016.html
## 4                                                                                                                                                <NA>
## 5                                                                                                                               http://bit.ly/2s0qDvb
## 6                                                                                                     http://www.westmonster.com/final-polls-verdict/
##                       url
## 1                    <NA>
## 2                        
## 3 https://t.co/2kl3ayLd44
## 4                    <NA>
## 5 https://t.co/953WYc2p0c
## 6 https://t.co/6zBHeHWuru

7.6.1.5 Unique values of a variable (here, lang)

# List the distinct values of the "lang" docvar.
unique(docvars(dat_twitter_corpus, field = "lang"))
##  [1] "en"  "tl"  "und" "fr"  "ja"  "de"  "sv"  "hu"  "ht"  "lt"  "cy"  "ar" 
## [13] "pt"  "in"  "fi"  "tr"  "es"  "hi"  "fa"  "it"  "pl"

7.6.2 Advanced manipulations

7.6.2.1 Tokens

tokens() segments the texts in a corpus into tokens (words or sentences) by word boundaries. We can choose whether to keep or remove punctuation.

7.6.2.1.1 With punctuation
# Tokenise the corpus with the default settings; as the preview shows,
# punctuation marks are kept as separate tokens.
dat_twitter_corpus_tok <- tokens(dat_twitter_corpus)
dat_twitter_corpus_tok
## Tokens consisting of 7,504 documents and 42 docvars.
## twitter.json.1 :
## [1] "@EFC_Jayy" "UKIP"     
## 
## twitter.json.2 :
##  [1] "RT"            "@Corbynator2"  ":"             "@jeremycorbyn"
##  [5] "Reaction"      "from"          "people"        "at"           
##  [9] "the"           "Watford"       "Rally"         ":"            
## [ ... and 18 more ]
## 
## twitter.json.3 :
##  [1] "RT"       "@ryvr"    ":"        "Stephen"  "Hawking"  ","       
##  [7] "the"      "world’s"  "smartest" "man"      ","        "backs"   
## [ ... and 4 more ]
## 
## twitter.json.4 :
##  [1] "RT"             "@TheGreenParty" ":"              "How"           
##  [5] "you"            "cast"           "your"           "vote"          
##  [9] "will"           "shape"          "the"            "future"        
## [ ... and 11 more ]
## 
## twitter.json.5 :
##  [1] "RT"          "@UKLabour"   ":"           "#VoteLabour" "today"      
##  [6] "for"         "a"           "fairer"      "Britain"     "."          
## [11] "Find"        "out"        
## [ ... and 11 more ]
## 
## twitter.json.6 :
##  [1] "RT"             "@WestmonsterUK" ":"              "POLLS"         
##  [5] ":"              "FINAL"          "DAY"            "AVERAGE"       
##  [9] "CON"            "43.8"           "%"              "LAB"           
## [ ... and 10 more ]
## 
## [ reached max_ndoc ... 7,498 more documents ]
7.6.2.1.2 Without punctuation
# Tokenise again, this time dropping punctuation tokens.
dat_twitter_corpus_tok_no_punct <- tokens(dat_twitter_corpus, remove_punct = TRUE)
dat_twitter_corpus_tok_no_punct
## Tokens consisting of 7,504 documents and 42 docvars.
## twitter.json.1 :
## [1] "@EFC_Jayy" "UKIP"     
## 
## twitter.json.2 :
##  [1] "RT"            "@Corbynator2"  "@jeremycorbyn" "Reaction"     
##  [5] "from"          "people"        "at"            "the"          
##  [9] "Watford"       "Rally"         "We"            "believe"      
## [ ... and 10 more ]
## 
## twitter.json.3 :
##  [1] "RT"                      "@ryvr"                  
##  [3] "Stephen"                 "Hawking"                
##  [5] "the"                     "world’s"                
##  [7] "smartest"                "man"                    
##  [9] "backs"                   "Jeremy"                 
## [11] "Corbyn"                  "https://t.co/2kl3ayLd44"
## [ ... and 1 more ]
## 
## twitter.json.4 :
##  [1] "RT"             "@TheGreenParty" "How"            "you"           
##  [5] "cast"           "your"           "vote"           "will"          
##  [9] "shape"          "the"            "future"         "Every"         
## [ ... and 6 more ]
## 
## twitter.json.5 :
##  [1] "RT"          "@UKLabour"   "#VoteLabour" "today"       "for"        
##  [6] "a"           "fairer"      "Britain"     "Find"        "out"        
## [11] "where"       "to"         
## [ ... and 7 more ]
## 
## twitter.json.6 :
##  [1] "RT"             "@WestmonsterUK" "POLLS"          "FINAL"         
##  [5] "DAY"            "AVERAGE"        "CON"            "43.8"          
##  [9] "LAB"            "35.4"           "LD"             "8"             
## [ ... and 4 more ]
## 
## [ reached max_ndoc ... 7,498 more documents ]

7.6.2.2 Compound words

7.6.2.2.1 kwic Phrase
# Keyword-in-context search for the two-token phrase "the tory", keeping
# 6 tokens of context on each side; then show the first 10 matches.
dat_twitter_corpus_tok_no_punct_phrase <- kwic(dat_twitter_corpus_tok_no_punct, pattern =  phrase("the tory"), window = 6)
head(dat_twitter_corpus_tok_no_punct_phrase, 10)
## Keyword-in-context with 10 matches.                                                                           
##   [twitter.json.112, 6:7] RT@SocialistVoice People are mocking | the Tory |
##   [twitter.json.131, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.155, 9:10]  Jeremy got an incredible welcome in | the Tory |
##   [twitter.json.465, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.1824, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.1826, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.1927, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.2301, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.2380, 6:7] RT@SocialistVoice People are mocking | the Tory |
##  [twitter.json.2514, 6:7] RT@SocialistVoice People are mocking | the Tory |
##                                                             
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  stronghold of Watford a little while                       
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
##  gutter press with hashtag#LastMinuteCorbynSmears#VoteLabour
7.6.2.2.2 Compounds
# Join each occurrence of "the tory" into a single compound token (printed as
# "the_Tory" below), then run a keyword-in-context search for that compound.
# NOTE(review): no `window` is passed here, whereas the phrase search uses
# window = 6, so the two outputs show different context widths — confirm
# this is intended.
dat_twitter_corpus_tok_no_punct_comp <- tokens_compound(dat_twitter_corpus_tok_no_punct, pattern = phrase("the tory"))
dat_twitter_corpus_tok_no_punct_comp_kwic <- kwic(dat_twitter_corpus_tok_no_punct_comp, pattern = phrase("the_tory"))
head(dat_twitter_corpus_tok_no_punct_comp_kwic, 10)
## Keyword-in-context with 10 matches.                                                                         
##   [twitter.json.112, 6] RT@SocialistVoice People are mocking | the_Tory |
##   [twitter.json.131, 6] RT@SocialistVoice People are mocking | the_Tory |
##   [twitter.json.155, 9]         got an incredible welcome in | the_Tory |
##   [twitter.json.465, 6] RT@SocialistVoice People are mocking | the_Tory |
##  [twitter.json.1824, 6] RT@SocialistVoice People are mocking | the_Tory |
##  [twitter.json.1826, 6] RT@SocialistVoice People are mocking | the_Tory |
##  [twitter.json.1927, 6] RT@SocialistVoice People are mocking | the_Tory |
##  [twitter.json.2301, 6] RT@SocialistVoice People are mocking | the_Tory |
##  [twitter.json.2380, 6] RT@SocialistVoice People are mocking | the_Tory |
##  [twitter.json.2514, 6] RT@SocialistVoice People are mocking | the_Tory |
##                                                  
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears
##  stronghold of Watford a little                  
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears
##  gutter press with hashtag#LastMinuteCorbynSmears

7.6.2.3 N-grams

N-grams are a subfamily of compound words. Depending on their length, they are called “bi-grams”, “tri-grams”, etc. N-grams are sequences of tokens built from an already tokenised text object.

7.6.2.3.1 Multi-grams

The code below extracts sequences of consecutive tokens, each made up of 2, 3 or 4 tokens.

# Build all 2-, 3- and 4-grams, flatten them into a single character vector,
# lower-case them and tabulate how often each n-gram occurs.
dat_twitter_corpus_tok_no_punct_ngram <- tokens_ngrams(dat_twitter_corpus_tok_no_punct, n = 2:4) %>% 
  unlist() %>%
  tolower() %>%
  table()
## Top 10 rows
# head() returns the first 10 entries in the table's own order (sorted by
# n-gram label), not the 10 most frequent n-grams.
head(dat_twitter_corpus_tok_no_punct_ngram, 10)
## .
##                                 ‼️_‼️                               ‼️_‼️_‼️ 
##                                  10                                   8 
##                             ‼️_‼️_‼️_‼️                       ‼️_‼️_‼️_#ge2017 
##                                   6                                   2 
##                         ‼️_‼️_#ge2017 ‼️_‼️_#ge2017_https://t.co/x5k8hydpka 
##                                   2                                   2 
##                           ‼️_#ge2017   ‼️_#ge2017_https://t.co/x5k8hydpka 
##                                   2                                   2 
##                                ‼️_do                            ‼️_do_not 
##                                   1                                   1
## Last 10 rows
# tail() returns the final 10 entries in the table's own order.
tail(dat_twitter_corpus_tok_no_punct_ngram, 10)
## .
##   英国_の_総_選挙         選挙_保守      選挙_保守_党   選挙_保守_党_と 
##                 1                 1                 1                 1 
##           選挙_区        選挙_区_制 選挙_区_制_なので             開_け 
##                 1                 1                 1                 1 
##          開_け_て       開_け_て_み 
##                 1                 1
7.6.2.3.2 Skip-grams

Skip-grams are n-grams built from non-consecutive tokens.

# Same n-gram sizes (2 to 4), but with skip = 1:2 the components are taken
# by skipping 1 or 2 tokens between them instead of being adjacent; the
# result is flattened, lower-cased and tabulated as before.
dat_twitter_corpus_tok_no_punct_ngram_skip <- tokens_ngrams(dat_twitter_corpus_tok_no_punct, n = 2:4, skip = 1:2) %>% 
  unlist() %>%
  tolower() %>%
  table()
## Top 10 rows
# head() returns the first 10 entries in the table's own order; the counts
# below (14, 8, 2, 8, ...) show it is not ordered by frequency.
head(dat_twitter_corpus_tok_no_punct_ngram_skip, 10)
## .
##                           ‼️_‼️                         ‼️_‼️_‼️ 
##                            14                             8 
##                 ‼️_‼️_‼️_#ge2017 ‼️_‼️_‼️_https://t.co/x5k8hydpka 
##                             2                             8 
##                   ‼️_‼️_#ge2017   ‼️_‼️_https://t.co/x5k8hydpka 
##                             8                             8 
##                     ‼️_#ge2017     ‼️_https://t.co/x5k8hydpka 
##                             4                             4 
##                         ‼️_not                  ‼️_not_pencil 
##                             1                             1
## Last 10 rows
# tail() returns the final 10 entries in the table's own order.
tail(dat_twitter_corpus_tok_no_punct_ngram_skip, 10)
## .
##      開_て_ない 開_て_ない_最終 開_て_ない_結果           開_み        開_み_と 
##               1               1               1               1               1 
##     開_み_と_は   開_み_と_結果      開_み_最終   開_み_最終_は   開_み_最終_分 
##               1               1               1               1               1

7.6.2.4 Dictionary

If you have a dictionary that groups various words under a generic entry (e.g., spelling or pronunciation variants of a word), you can look these up in your texts. Here, we create a dictionary that we populate ourselves and show how to use it to search for items.

7.6.2.4.1 Create dictionary
# Two-key dictionary: each key ("Tories", "Labour") groups the patterns that
# should count towards it. The trailing "*" is presumably a glob wildcard
# matching any ending — confirm against the lookup's valuetype.
dict_dat_twitter <- dictionary(list(Tories = c("Tory*", "Conservative*"),
                        Labour = c("Socialist*", "Labour*")))
print(dict_dat_twitter)
## Dictionary object with 2 key entries.
## - [Tories]:
##   - tory*, conservative*
## - [Labour]:
##   - socialist*, labour*
7.6.2.4.2 Token lookup
# Look the tokens up in the dictionary. Documents without any match come back
# empty, which is why the first documents print as character(0).
dat_twitter_corpus_tok_no_punct_dict_toks <- tokens_lookup(dat_twitter_corpus_tok_no_punct, dictionary = dict_dat_twitter)
print(dat_twitter_corpus_tok_no_punct_dict_toks)
## Tokens consisting of 7,504 documents and 42 docvars.
## twitter.json.1 :
## character(0)
## 
## twitter.json.2 :
## character(0)
## 
## twitter.json.3 :
## character(0)
## 
## twitter.json.4 :
## character(0)
## 
## twitter.json.5 :
## character(0)
## 
## twitter.json.6 :
## character(0)
## 
## [ reached max_ndoc ... 7,498 more documents ]
7.6.2.4.3 DFM
## Document-feature matrix of the looked-up tokens (one column per key)
dat_twitter_corpus_tok_no_punct_dict_toks %>%
  dfm()
## Document-feature matrix of: 7,504 documents, 2 features (85.65% sparse) and 42 docvars.
##                 features
## docs             tories labour
##   twitter.json.1      0      0
##   twitter.json.2      0      0
##   twitter.json.3      0      0
##   twitter.json.4      0      0
##   twitter.json.5      0      0
##   twitter.json.6      0      0
## [ reached max_ndoc ... 7,498 more documents ]

7.6.2.5 Part of Speech tagging

Part-of-Speech tagging (or PoS-tagging) is used to distinguish different parts of speech, e.g., the sentence “Jane likes the girl” can be tagged as “Jane/NNP likes/VBZ the/DT girl/NN”, where NNP = proper noun (singular), VBZ = 3rd person singular present tense verb, DT = determiner, and NN = noun (singular or mass). We will use the udpipe package.

7.6.2.5.1 Download and load language model

Before using the PoS-tagger, we need to download a language model. As you can see by typing ?udpipe_download_model, there are models for 65 languages trained on 101 treebanks from here.

## Expected location of the English (EWT) udpipe model on disk
file_to_check <- "models/english-ewt-ud-2.5-191206.udpipe"

## Reuse the local copy when present; otherwise download the model and
## load the file that was actually downloaded
if (file.exists(file_to_check)) {
  m_english_path <- file_to_check
} else {
  m_english_info <- udpipe_download_model(
    language = "english-ewt",
    model_dir = dirname(file_to_check)
  )
  ## Stop with a clear message instead of trying to load a missing file
  if (isTRUE(m_english_info$download_failed)) {
    stop(
      "Could not download the udpipe model: ",
      m_english_info$download_message,
      call. = FALSE
    )
  }
  m_english_path <- m_english_info$file_model
}
m_english <- udpipe_load_model(file = m_english_path)
7.6.2.5.2 Tokenise, tag, dependency parsing

We use the already tokenised text of the second tweet, with punctuation removed.

## Annotate the tokens of the second tweet and convert to a data frame.
## NOTE(review): in the printed result every token gets its own doc_id and
## is its own root, i.e. each token is annotated as a separate document
dat_twitter_anndf <- as.data.frame(
  udpipe_annotate(m_english, x = dat_twitter_corpus_tok_no_punct[[2]])
)
## inspect
head(dat_twitter_anndf, n = 10)
##    doc_id paragraph_id sentence_id      sentence token_id         token
## 1    doc1            1           1            RT        1            RT
## 2    doc2            1           1  @Corbynator2        1  @Corbynator2
## 3    doc3            1           1 @jeremycorbyn        1 @jeremycorbyn
## 4    doc4            1           1      Reaction        1      Reaction
## 5    doc5            1           1          from        1          from
## 6    doc6            1           1        people        1        people
## 7    doc7            1           1            at        1            at
## 8    doc8            1           1           the        1           the
## 9    doc9            1           1       Watford        1       Watford
## 10  doc10            1           1         Rally        1         Rally
##            lemma  upos xpos                     feats head_token_id dep_rel
## 1             Rt PROPN  NNP               Number=Sing             0    root
## 2   @corbynator2     X  ADD                      <NA>             0    root
## 3  @jeremycorbyn  NOUN   NN               Number=Sing             0    root
## 4       reaction  NOUN   NN               Number=Sing             0    root
## 5           from   ADP   IN                      <NA>             0    root
## 6         people  NOUN  NNS               Number=Plur             0    root
## 7             at   ADP   IN                      <NA>             0    root
## 8            the   DET   DT Definite=Def|PronType=Art             0    root
## 9        Watford PROPN  NNP               Number=Sing             0    root
## 10         rally   ADV   RB                      <NA>             0    root
##    deps            misc
## 1  <NA> SpacesAfter=\\n
## 2  <NA> SpacesAfter=\\n
## 3  <NA> SpacesAfter=\\n
## 4  <NA> SpacesAfter=\\n
## 5  <NA> SpacesAfter=\\n
## 6  <NA> SpacesAfter=\\n
## 7  <NA> SpacesAfter=\\n
## 8  <NA> SpacesAfter=\\n
## 9  <NA> SpacesAfter=\\n
## 10 <NA> SpacesAfter=\\n
7.6.2.5.3 Dependency parsing
## parse text
## parse the raw text of the second tweet and convert to a data frame
dat_twitter_corpus_sent <- as.data.frame(
  udpipe_annotate(m_english, x = dat_twitter_corpus[[2]])
)
## inspect
head(dat_twitter_corpus_sent)
##   doc_id paragraph_id sentence_id
## 1   doc1            1           1
## 2   doc1            1           1
## 3   doc1            1           2
## 4   doc1            1           2
## 5   doc1            1           2
## 6   doc1            1           2
##                                                                    sentence
## 1                                             RT @Corbynator2:@jeremycorbyn
## 2                                             RT @Corbynator2:@jeremycorbyn
## 3 Reaction from people at the Watford Rally: “We believe in Jeremy Corbyn!”
## 4 Reaction from people at the Watford Rally: “We believe in Jeremy Corbyn!”
## 5 Reaction from people at the Watford Rally: “We believe in Jeremy Corbyn!”
## 6 Reaction from people at the Watford Rally: “We believe in Jeremy Corbyn!”
##   token_id                      token                      lemma  upos xpos
## 1        1                         RT                         Rt PROPN  NNP
## 2        2 @Corbynator2:@jeremycorbyn @Corbynator2:@jeremycorbyn PROPN  NNP
## 3        1                   Reaction                   reaction  NOUN   NN
## 4        2                       from                       from   ADP   IN
## 5        3                     people                     people  NOUN  NNS
## 6        4                         at                         at   ADP   IN
##         feats head_token_id dep_rel deps misc
## 1 Number=Sing             0    root <NA> <NA>
## 2 Number=Sing             1    flat <NA> <NA>
## 3 Number=Sing             0    root <NA> <NA>
## 4        <NA>             3    case <NA> <NA>
## 5 Number=Plur             1    nmod <NA> <NA>
## 6        <NA>             7    case <NA> <NA>
## Keep only the second sentence of the parsed tweet. Selecting by
## sentence_id instead of hard-coded row positions guarantees that the
## dependency plot receives the tokens of exactly one sentence
dat_twitter_corpus_sent_short <- dat_twitter_corpus_sent[
  dat_twitter_corpus_sent$sentence_id == 2,
]
dat_twitter_corpus_sent_dplot <- textplot_dependencyparser(
  dat_twitter_corpus_sent_short,
  size = 3
)
## show plot
dat_twitter_corpus_sent_dplot

7.6.2.6 Feature co-occurrence matrix (FCM)

A feature co-occurrence matrix (FCM) records the number of co-occurrences of tokens.

7.6.2.6.1 Computing number of co-occurrences
## Document-feature matrix from the punctuation-free tokens
dat_twitter_corpus_dfmat <- dat_twitter_corpus_tok_no_punct %>%
  dfm()
## Trim the matrix with a minimum term frequency of 50
dat_twitter_corpus_dfmat_trim <- dat_twitter_corpus_dfmat %>%
  dfm_trim(min_termfreq = 50)

## Most frequent features of the trimmed matrix
topfeatures_dat_twitter_corpus <- dat_twitter_corpus_dfmat_trim %>%
  topfeatures()
topfeatures_dat_twitter_corpus
##      rt     the      to       a #ge2017     for    vote     and      of      in 
##    5827    3973    3418    2627    2483    2334    1886    1738    1738    1598
## Number of features left after trimming
dat_twitter_corpus_dfmat_trim %>%
  nfeat()
## [1] 383
7.6.2.6.2 Features co-occurrences
## Feature co-occurrence matrix computed from the trimmed dfm
dat_twitter_corpus_fcmat <- dat_twitter_corpus_dfmat_trim %>%
  fcm()
dat_twitter_corpus_fcmat
## Feature co-occurrence matrix of: 383 by 383 features.
##                features
## features        ukip  rt @corbynator2 @jeremycorbyn from people  at  the rally
##   ukip             4 194            0             0    6      2   8   86     0
##   rt               0 236          171           434  292    326 207 3165    95
##   @corbynator2     0   0            0           168    4      7  23   91    23
##   @jeremycorbyn    0   0            0             0   11     42  36  377    23
##   from             0   0            0             0    3     10  15  269    27
##   people           0   0            0             0    0     16  24  276     5
##   at               0   0            0             0    0      0  17  187    41
##   the              0   0            0             0    0      0   0 1311   104
##   rally            0   0            0             0    0      0   0    0    10
##   we               0   0            0             0    0      0   0    0     0
##                features
## features         we
##   ukip            9
##   rt            451
##   @corbynator2   11
##   @jeremycorbyn  26
##   from           26
##   people         37
##   at             25
##   the           268
##   rally           9
##   we            139
## [ reached max_nfeat ... 373 more features, reached max_nfeat ... 373 more features ]