6.7 Single web page
6.7.1 read_html
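The output below was presumably produced by downloading the page with rvest::read_html(). The exact URL is an assumption, inferred from the "Tidyverse packages" headline extracted later in this section:

```r
library(rvest)

## Download and parse the page; the URL is an assumption based on the
## "Tidyverse packages" headline extracted below
web_page <- rvest::read_html("https://www.tidyverse.org/packages/")
web_page
```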
## {html_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n <div id="appTidyverseSite" class="shrinkHeader alwaysShrinkHead ...
Because the downloaded file contains unnecessary information, we process the data to extract only the text from the webpage.
6.7.2 Extract headline
header_web_page <- web_page %>%
## extract the headline
rvest::html_nodes("h1") %>%
## extract text
rvest::html_text()
head(header_web_page)
## [1] "Tidyverse packages"
6.7.3 Extract text
web_page_txt <- web_page %>%
## extract paragraphs
rvest::html_nodes("p") %>%
## extract text
rvest::html_text()
head(web_page_txt)
## [1] "Install all the packages in the tidyverse by running install.packages(\"tidyverse\")."
## [2] "Run library(tidyverse) to load the core tidyverse and make it available\nin your current R session."
## [3] "Learn more about the tidyverse package at https://tidyverse.tidyverse.org."
## [4] "The core tidyverse includes the packages that you’re likely to use in everyday data analyses. As of tidyverse 1.3.0, the following packages are included in the core tidyverse:"
## [5] "ggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics. You provide the data, tell ggplot2 how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details. Go to docs..."
## [6] "dplyr provides a grammar of data manipulation, providing a consistent set of verbs that solve the most common data manipulation challenges. Go to docs..."
6.7.4 Create a corpus
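The corpus shown below is presumably built from the extracted paragraphs with quanteda's corpus() constructor:

```r
library(quanteda)

## Create a corpus with one document per extracted paragraph
web_page_txt_corpus <- quanteda::corpus(web_page_txt)
web_page_txt_corpus
```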
## Corpus consisting of 31 documents.
## text1 :
## "Install all the packages in the tidyverse by running install..."
##
## text2 :
## "Run library(tidyverse) to load the core tidyverse and make i..."
##
## text3 :
## "Learn more about the tidyverse package at https://tidyverse...."
##
## text4 :
## "The core tidyverse includes the packages that you’re likely ..."
##
## text5 :
## "ggplot2 is a system for declaratively creating graphics, bas..."
##
## text6 :
## "dplyr provides a grammar of data manipulation, providing a c..."
##
## [ reached max_ndoc ... 25 more documents ]
6.7.4.1 Summary
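The summary below can be reproduced with quanteda's summary() method for corpus objects, limiting the display to the first 10 documents; the later Summary subsections presumably apply the same call to the corresponding reshaped corpora:

```r
## Types, tokens, and sentence counts for the first 10 documents
summary(web_page_txt_corpus, n = 10)
```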
## Corpus consisting of 31 documents, showing 10 documents:
##
## Text Types Tokens Sentences
## text1 13 16 1
## text2 19 20 1
## text3 9 9 1
## text4 24 31 2
## text5 37 50 3
## text6 22 29 2
## text7 30 46 3
## text8 40 51 3
## text9 45 57 3
## text10 48 64 3
6.7.5 Basic manipulations
By default, a corpus is organised by "documents" (here, one extracted paragraph per document). We can reshape it to show "sentences" or "paragraphs".
6.7.5.1 Sentences
6.7.5.1.1 Transform
web_page_txt_corpus_sent <- corpus_reshape(web_page_txt_corpus, to = "sentences")
web_page_txt_corpus_sent
## Corpus consisting of 57 documents.
## text1.1 :
## "Install all the packages in the tidyverse by running install..."
##
## text2.1 :
## "Run library(tidyverse) to load the core tidyverse and make i..."
##
## text3.1 :
## "Learn more about the tidyverse package at https://tidyverse...."
##
## text4.1 :
## "The core tidyverse includes the packages that you’re likely ..."
##
## text4.2 :
## "As of tidyverse 1.3.0, the following packages are included i..."
##
## text5.1 :
## "ggplot2 is a system for declaratively creating graphics, bas..."
##
## [ reached max_ndoc ... 51 more documents ]
6.7.5.1.2 Summary
## Corpus consisting of 57 documents, showing 57 documents:
##
## Text Types Tokens Sentences
## text1.1 13 16 1
## text2.1 19 20 1
## text3.1 9 9 1
## text4.1 16 16 1
## text4.2 13 15 1
## text5.1 16 16 1
## text5.2 23 28 1
## text5.3 4 6 1
## text6.1 19 23 1
## text6.2 4 6 1
## text7.1 14 14 1
## text7.2 17 26 1
## text7.3 4 6 1
## text8.1 19 21 1
## text8.2 23 24 1
## text8.3 4 6 1
## text9.1 24 25 1
## text9.2 24 26 1
## text9.3 4 6 1
## text10.1 24 27 1
## text10.2 26 31 1
## text10.3 4 6 1
## text11.1 17 18 1
## text11.2 23 25 1
## text11.3 4 6 1
## text12.1 14 14 1
## text12.2 19 20 1
## text12.3 4 6 1
## text13.1 20 21 1
## text13.2 4 6 1
## text14.1 12 12 1
## text14.2 22 27 1
## text15.1 20 23 1
## text16.1 5 5 1
## text16.2 7 7 1
## text16.3 18 20 1
## text16.4 5 5 1
## text17.1 9 10 1
## text18.1 5 5 1
## text19.1 7 9 1
## text20.1 9 10 1
## text21.1 6 6 1
## text22.1 5 5 1
## text23.1 4 4 1
## text23.2 7 7 1
## text24.1 4 4 1
## text25.1 33 39 1
## text26.1 19 19 1
## text27.1 30 31 1
## text28.1 11 13 1
## text28.2 26 29 1
## text29.1 17 18 1
## text30.1 19 21 1
## text30.2 15 15 1
## text30.3 17 19 1
## text31.1 33 41 1
## text31.2 21 23 1
6.7.5.1.3 Subset
We can subset sentences with 10 or more tokens:
web_page_txt_corpus_sent_long <- corpus_subset(web_page_txt_corpus_sent, ntoken(web_page_txt_corpus_sent) >= 10)
ndoc(web_page_txt_corpus_sent_long)
## [1] 37
## Corpus consisting of 37 documents, showing 37 documents:
##
## Text Types Tokens Sentences
## text1.1 13 16 1
## text2.1 19 20 1
## text4.1 16 16 1
## text4.2 13 15 1
## text5.1 16 16 1
## text5.2 23 28 1
## text6.1 19 23 1
## text7.1 14 14 1
## text7.2 17 26 1
## text8.1 19 21 1
## text8.2 23 24 1
## text9.1 24 25 1
## text9.2 24 26 1
## text10.1 24 27 1
## text10.2 26 31 1
## text11.1 17 18 1
## text11.2 23 25 1
## text12.1 14 14 1
## text12.2 19 20 1
## text13.1 20 21 1
## text14.1 12 12 1
## text14.2 22 27 1
## text15.1 20 23 1
## text16.3 18 20 1
## text17.1 9 10 1
## text20.1 9 10 1
## text25.1 33 39 1
## text26.1 19 19 1
## text27.1 30 31 1
## text28.1 11 13 1
## text28.2 26 29 1
## text29.1 17 18 1
## text30.1 19 21 1
## text30.2 15 15 1
## text30.3 17 19 1
## text31.1 33 41 1
## text31.2 21 23 1
6.7.5.2 Paragraphs
6.7.5.2.1 Transform
web_page_txt_corpus_para <- corpus_reshape(web_page_txt_corpus, to = "paragraphs")
web_page_txt_corpus_para
## Corpus consisting of 35 documents.
## text1.1 :
## "Install all the packages in the tidyverse by running install..."
##
## text2.1 :
## "Run library(tidyverse) to load the core tidyverse and make i..."
##
## text3.1 :
## "Learn more about the tidyverse package at https://tidyverse...."
##
## text4.1 :
## "The core tidyverse includes the packages that you’re likely ..."
##
## text5.1 :
## "ggplot2 is a system for declaratively creating graphics, bas..."
##
## text6.1 :
## "dplyr provides a grammar of data manipulation, providing a c..."
##
## [ reached max_ndoc ... 29 more documents ]
6.7.5.2.2 Summary
## Corpus consisting of 35 documents, showing 35 documents:
##
## Text Types Tokens Sentences
## text1.1 13 16 1
## text2.1 19 20 1
## text3.1 9 9 1
## text4.1 24 31 2
## text5.1 37 50 3
## text6.1 22 29 2
## text7.1 30 46 3
## text8.1 40 51 3
## text9.1 45 57 3
## text10.1 48 64 3
## text11.1 39 49 3
## text12.1 30 40 3
## text13.1 23 27 2
## text14.1 31 39 2
## text15.1 20 23 1
## text16.1 21 23 3
## text16.2 2 2 1
## text16.3 2 2 1
## text16.4 3 3 1
## text16.5 6 7 2
## text17.1 9 10 1
## text18.1 5 5 1
## text19.1 7 9 1
## text20.1 9 10 1
## text21.1 6 6 1
## text22.1 5 5 1
## text23.1 10 11 2
## text24.1 4 4 1
## text25.1 33 39 1
## text26.1 19 19 1
## text27.1 30 31 1
## text28.1 34 42 2
## text29.1 17 18 1
## text30.1 44 55 3
## text31.1 46 64 2
6.7.5.2.3 Subset
We can subset paragraphs with 10 or more tokens:
web_page_txt_corpus_para_long <- corpus_subset(web_page_txt_corpus_para, ntoken(web_page_txt_corpus_para) >= 10)
ndoc(web_page_txt_corpus_para_long)
## [1] 25
## Corpus consisting of 25 documents, showing 25 documents:
##
## Text Types Tokens Sentences
## text1.1 13 16 1
## text2.1 19 20 1
## text4.1 24 31 2
## text5.1 37 50 3
## text6.1 22 29 2
## text7.1 30 46 3
## text8.1 40 51 3
## text9.1 45 57 3
## text10.1 48 64 3
## text11.1 39 49 3
## text12.1 30 40 3
## text13.1 23 27 2
## text14.1 31 39 2
## text15.1 20 23 1
## text16.1 21 23 3
## text17.1 9 10 1
## text20.1 9 10 1
## text23.1 10 11 2
## text25.1 33 39 1
## text26.1 19 19 1
## text27.1 30 31 1
## text28.1 34 42 2
## text29.1 17 18 1
## text30.1 44 55 3
## text31.1 46 64 2
6.7.5.3 Tokens
tokens()
segments the texts in a corpus into tokens (words or sentences) by word boundaries. Punctuation can be kept or removed.
6.7.5.3.1 With punctuation
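The code producing the output below is not shown; a minimal sketch, assuming the (hypothetical) object name web_page_txt_corpus_tok:

```r
## Tokenise the corpus, keeping punctuation (the default)
web_page_txt_corpus_tok <- tokens(web_page_txt_corpus)
web_page_txt_corpus_tok
```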
## Tokens consisting of 31 documents.
## text1 :
## [1] "Install" "all" "the" "packages"
## [5] "in" "the" "tidyverse" "by"
## [9] "running" "install.packages" "(" "\""
## [ ... and 4 more ]
##
## text2 :
## [1] "Run" "library" "(" "tidyverse" ")" "to"
## [7] "load" "the" "core" "tidyverse" "and" "make"
## [ ... and 8 more ]
##
## text3 :
## [1] "Learn" "more"
## [3] "about" "the"
## [5] "tidyverse" "package"
## [7] "at" "https://tidyverse.tidyverse.org"
## [9] "."
##
## text4 :
## [1] "The" "core" "tidyverse" "includes" "the" "packages"
## [7] "that" "you’re" "likely" "to" "use" "in"
## [ ... and 19 more ]
##
## text5 :
## [1] "ggplot2" "is" "a" "system"
## [5] "for" "declaratively" "creating" "graphics"
## [9] "," "based" "on" "The"
## [ ... and 38 more ]
##
## text6 :
## [1] "dplyr" "provides" "a" "grammar" "of"
## [6] "data" "manipulation" "," "providing" "a"
## [11] "consistent" "set"
## [ ... and 17 more ]
##
## [ reached max_ndoc ... 25 more documents ]
6.7.5.3.2 Without punctuation
web_page_txt_corpus_tok_no_punct <- tokens(web_page_txt_corpus, remove_punct = TRUE)
web_page_txt_corpus_tok_no_punct
## Tokens consisting of 31 documents.
## text1 :
## [1] "Install" "all" "the" "packages"
## [5] "in" "the" "tidyverse" "by"
## [9] "running" "install.packages" "tidyverse"
##
## text2 :
## [1] "Run" "library" "tidyverse" "to" "load" "the"
## [7] "core" "tidyverse" "and" "make" "it" "available"
## [ ... and 5 more ]
##
## text3 :
## [1] "Learn" "more"
## [3] "about" "the"
## [5] "tidyverse" "package"
## [7] "at" "https://tidyverse.tidyverse.org"
##
## text4 :
## [1] "The" "core" "tidyverse" "includes" "the" "packages"
## [7] "that" "you’re" "likely" "to" "use" "in"
## [ ... and 16 more ]
##
## text5 :
## [1] "ggplot2" "is" "a" "system"
## [5] "for" "declaratively" "creating" "graphics"
## [9] "based" "on" "The" "Grammar"
## [ ... and 29 more ]
##
## text6 :
## [1] "dplyr" "provides" "a" "grammar" "of"
## [6] "data" "manipulation" "providing" "a" "consistent"
## [11] "set" "of"
## [ ... and 12 more ]
##
## [ reached max_ndoc ... 25 more documents ]
6.7.5.4 Types
We can also extract the types (the unique tokens) from the tokenised corpus (without punctuation):
web_page_txt_corpus_tok_no_punct_types <- types(web_page_txt_corpus_tok_no_punct)
web_page_txt_corpus_tok_no_punct_types
## [1] "Install" "all"
## [3] "the" "packages"
## [5] "in" "tidyverse"
## [7] "by" "running"
## [9] "install.packages" "Run"
## [11] "library" "to"
## [13] "load" "core"
## [15] "and" "make"
## [17] "it" "available"
## [19] "your" "current"
## [21] "R" "session"
## [23] "Learn" "more"
## [25] "about" "package"
## [27] "at" "https://tidyverse.tidyverse.org"
## [29] "The" "includes"
## [31] "that" "you’re"
## [33] "likely" "use"
## [35] "everyday" "data"
## [37] "analyses" "As"
## [39] "of" "1.3.0"
## [41] "following" "are"
## [43] "included" "ggplot2"
## [45] "is" "a"
## [47] "system" "for"
## [49] "declaratively" "creating"
## [51] "graphics" "based"
## [53] "on" "Grammar"
## [55] "Graphics" "You"
## [57] "provide" "tell"
## [59] "how" "map"
## [61] "variables" "aesthetics"
## [63] "what" "graphical"
## [65] "primitives" "takes"
## [67] "care" "details"
## [69] "Go" "docs"
## [71] "dplyr" "provides"
## [73] "grammar" "manipulation"
## [75] "providing" "consistent"
## [77] "set" "verbs"
## [79] "solve" "most"
## [81] "common" "challenges"
## [83] "tidyr" "functions"
## [85] "help" "you"
## [87] "get" "tidy"
## [89] "Tidy" "with"
## [91] "form" "brief"
## [93] "every" "variable"
## [95] "goes" "column"
## [97] "readr" "fast"
## [99] "friendly" "way"
## [101] "read" "rectangular"
## [103] "like" "csv"
## [105] "tsv" "fwf"
## [107] "It" "designed"
## [109] "flexibly" "parse"
## [111] "many" "types"
## [113] "found" "wild"
## [115] "while" "still"
## [117] "cleanly" "failing"
## [119] "when" "unexpectedly"
## [121] "changes" "purrr"
## [123] "enhances" "R’s"
## [125] "functional" "programming"
## [127] "FP" "toolkit"
## [129] "complete" "tools"
## [131] "working" "vectors"
## [133] "Once" "master"
## [135] "basic" "concepts"
## [137] "allows" "replace"
## [139] "loops" "code"
## [141] "easier" "write"
## [143] "expressive" "tibble"
## [145] "modern" "re-imagining"
## [147] "frame" "keeping"
## [149] "time" "has"
## [151] "proven" "be"
## [153] "effective" "throwing"
## [155] "out" "not"
## [157] "Tibbles" "data.frames"
## [159] "lazy" "surly"
## [161] "they" "do"
## [163] "less" "complain"
## [165] "forcing" "confront"
## [167] "problems" "earlier"
## [169] "typically" "leading"
## [171] "cleaner" "stringr"
## [173] "cohesive" "strings"
## [175] "as" "easy"
## [177] "possible" "built"
## [179] "top" "stringi"
## [181] "which" "uses"
## [183] "ICU" "C"
## [185] "correct" "implementations"
## [187] "string" "manipulations"
## [189] "forcats" "suite"
## [191] "useful" "factors"
## [193] "handle" "categorical"
## [195] "have" "fixed"
## [197] "known" "values"
## [199] "lubridate" "date-times"
## [201] "extending" "improving"
## [203] "R's" "existing"
## [205] "support" "them"
## [207] "also" "other"
## [209] "specialised" "usage"
## [211] "They" "loaded"
## [213] "automatically" "so"
## [215] "you’ll" "need"
## [217] "each" "one"
## [219] "its" "own"
## [221] "call" "well"
## [223] "reading" "flat"
## [225] "files" "installs"
## [227] "number" "DBI"
## [229] "relational" "databases"
## [231] "Maintained" "Kirill"
## [233] "Müller" "You’ll"
## [235] "pair" "database"
## [237] "specific" "backends"
## [239] "RSQLite" "RMariaDB"
## [241] "RPostgres" "or"
## [243] "odbc" "https://db.rstudio.com"
## [245] "haven" "SPSS"
## [247] "Stata" "SAS"
## [249] "httr" "web"
## [251] "APIs" "readxl"
## [253] "xls" "xlsx"
## [255] "sheets" "googlesheets4"
## [257] "Google" "Sheets"
## [259] "via" "API"
## [261] "v4" "googledrive"
## [263] "Drive" "rvest"
## [265] "scraping" "jsonlite"
## [267] "JSON" "Jeroen"
## [269] "Ooms" "xml2"
## [271] "XML" "In"
## [273] "addition" "three"
## [275] "specialized" "already"
## [277] "there" "two"
## [279] "others" "work"
## [281] "There" "allow"
## [283] "interface" "different"
## [285] "using" "same"
## [287] "syntax" "very"
## [289] "natural" "methods"
## [291] "iterating" "objects"
## [293] "additional" "general"
## [295] "magrittr" "pipe"
## [297] ">" "used"
## [299] "throughout" "piping"
## [301] "operators" "$"
## [303] "<" "can"
## [305] "places" "glue"
## [307] "an" "alternative"
## [309] "paste" "makes"
## [311] "combine" "Modeling"
## [313] "collection" "tidymodels"
## [315] "largely" "modelr"
## [317] "R4DS" "These"
## [319] "comprehensive" "foundation"
## [321] "models" "Visit"
## [323] "Getting" "Started"
## [325] "guide" "detailed"
## [327] "examples" "go"
## [329] "straight" "page"
## [331] "If" "asking"
## [333] "reporting" "bug"
## [335] "requesting" "new"
## [337] "feature" "succeed"
## [339] "if" "include"
## [341] "good" "reproducible"
## [343] "example" "precisely"
## [345] "reprex" "meant"
## [347] "learn" "along"
## [349] "tips" "section"
6.7.5.5 Keyword-in-contexts (kwic)
6.7.5.5.1 Pattern
You can see how keywords are used in their actual contexts in a concordance view produced by kwic(). The pattern argument searches for the pattern we are interested in (with * as a wildcard), and window sets the number of words/tokens displayed around it.
web_page_txt_corpus_tok_no_punct_types_tidy <- kwic(web_page_txt_corpus_tok_no_punct, pattern = "tidy*", window = 6)
head(web_page_txt_corpus_tok_no_punct_types_tidy, 10)
## Keyword-in-context with 10 matches.
## [text1, 7] Install all the packages in the | tidyverse |
## [text1, 11] in the tidyverse by running install.packages | tidyverse |
## [text2, 3] Run library | tidyverse |
## [text2, 8] library tidyverse to load the core | tidyverse |
## [text3, 5] Learn more about the | tidyverse |
## [text4, 3] The core | tidyverse |
## [text4, 18] in everyday data analyses As of | tidyverse |
## [text4, 28] packages are included in the core | tidyverse |
## [text7, 1] | tidyr |
## [text7, 12] functions that help you get to | tidy |
##
## by running install.packages tidyverse
##
## to load the core tidyverse and
## and make it available in your
## package at https://tidyverse.tidyverse.org
## includes the packages that you’re likely
## 1.3.0 the following packages are included
##
## provides a set of functions that
## data Tidy data is data with
6.7.5.5.2 Phrase
web_page_txt_corpus_tok_no_punct_phrase <- kwic(web_page_txt_corpus_tok_no_punct, pattern = phrase("the tidy*"), window = 6)
head(web_page_txt_corpus_tok_no_punct_phrase, 10)
## Keyword-in-context with 6 matches.
## [text1, 6:7] Install all the packages in | the tidyverse |
## [text3, 4:5] Learn more about | the tidyverse |
## [text14, 1:2] | The tidyverse |
## [text15, 9:10] as readr for reading flat files | the tidyverse |
## [text28, 8:9] provides the pipe > used throughout | the tidyverse |
## [text30, 3:4] Modeling with | the tidyverse |
##
## by running install.packages tidyverse
## package at https://tidyverse.tidyverse.org
## also includes many other packages with
## package installs a number of other
## It also provide a number of
## uses the collection of tidymodels packages
6.7.5.6 stopwords
Stopwords are function words (or grammatical words). We can search for them and remove them when they are not needed; this step is often useful because we are usually not interested in stopwords.
tokens_remove()
is an alias for tokens_select(selection = "remove")
web_page_txt_corpus_tok_no_punct_no_Stop <- tokens_select(web_page_txt_corpus_tok_no_punct, pattern = stopwords("en", source = "stopwords-iso"), selection = "remove")
web_page_txt_corpus_tok_no_punct_no_Stop
## Tokens consisting of 31 documents.
## text1 :
## [1] "Install" "packages" "tidyverse" "running"
## [5] "install.packages" "tidyverse"
##
## text2 :
## [1] "library" "tidyverse" "load" "core" "tidyverse" "current"
## [7] "session"
##
## text3 :
## [1] "Learn" "tidyverse"
## [3] "package" "https://tidyverse.tidyverse.org"
##
## text4 :
## [1] "core" "tidyverse" "includes" "packages" "you’re" "everyday"
## [7] "data" "analyses" "tidyverse" "1.3.0" "packages" "included"
## [ ... and 2 more ]
##
## text5 :
## [1] "ggplot2" "declaratively" "creating" "graphics"
## [5] "based" "Grammar" "Graphics" "provide"
## [9] "data" "ggplot2" "map" "variables"
## [ ... and 7 more ]
##
## text6 :
## [1] "dplyr" "grammar" "data" "manipulation" "providing"
## [6] "consistent" "set" "verbs" "solve" "common"
## [11] "data" "manipulation"
## [ ... and 2 more ]
##
## [ reached max_ndoc ... 25 more documents ]
6.7.5.7 wordstem
To extract the stem of each word, we use the function tokens_wordstem()
, char_wordstem()
or dfm_wordstem()
6.7.5.7.1 tokens_wordstem
web_page_txt_corpus_tok_no_punct_no_Stop_stem <- tokens_wordstem(web_page_txt_corpus_tok_no_punct_no_Stop)
web_page_txt_corpus_tok_no_punct_no_Stop_stem
## Tokens consisting of 31 documents.
## text1 :
## [1] "Install" "packag" "tidyvers" "run"
## [5] "install.packag" "tidyvers"
##
## text2 :
## [1] "librari" "tidyvers" "load" "core" "tidyvers" "current" "session"
##
## text3 :
## [1] "Learn" "tidyvers"
## [3] "packag" "https://tidyverse.tidyverse.org"
##
## text4 :
## [1] "core" "tidyvers" "includ" "packag" "you’r" "everyday"
## [7] "data" "analys" "tidyvers" "1.3.0" "packag" "includ"
## [ ... and 2 more ]
##
## text5 :
## [1] "ggplot2" "declar" "creat" "graphic" "base" "Grammar" "Graphic"
## [8] "provid" "data" "ggplot2" "map" "variabl"
## [ ... and 7 more ]
##
## text6 :
## [1] "dplyr" "grammar" "data" "manipul" "provid" "consist" "set"
## [8] "verb" "solv" "common" "data" "manipul"
## [ ... and 2 more ]
##
## [ reached max_ndoc ... 25 more documents ]
6.7.5.7.2 dfm_wordstem
Here we can use a dfm
(document-feature matrix) to obtain the word stems used in each of the texts
web_page_txt_corpus_tok_no_punct_no_Stop_stem_dfm <- dfm(web_page_txt_corpus_tok_no_punct_no_Stop)
dfm_wordstem(web_page_txt_corpus_tok_no_punct_no_Stop_stem_dfm)
## Document-feature matrix of: 31 documents, 204 features (95.13% sparse) and 0 docvars.
## features
## docs instal packag tidyvers run install.packag librari load core current
## text1 1 1 2 1 1 0 0 0 0
## text2 0 0 2 0 0 1 1 1 1
## text3 0 1 1 0 0 0 0 0 0
## text4 0 2 3 0 0 0 0 2 0
## text5 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0
## features
## docs session
## text1 0
## text2 1
## text3 0
## text4 0
## text5 0
## text6 0
## [ reached max_ndoc ... 25 more documents, reached max_nfeat ... 194 more features ]