7.8 Multiple webpages

In this section we scrape the tidyverse packages page, follow the link to each package's website, and analyse the texts collected from the subpages as a corpus.

7.8.1 Read_html

website <- "https://www.tidyverse.org/packages/" %>% 
  rvest::read_html()
website
## {html_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n  <div id="appTidyverseSite" class="shrinkHeader alwaysShrinkHead ...
a_elements <- website %>% 
  rvest::html_elements(css = "div.package > a")
a_elements
## {xml_nodeset (9)}
## [1] <a href="https://ggplot2.tidyverse.org/" target="_blank">\n    <img class ...
## [2] <a href="https://dplyr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [3] <a href="https://tidyr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [4] <a href="https://readr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [5] <a href="https://purrr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [6] <a href="https://tibble.tidyverse.org/" target="_blank">\n    <img class= ...
## [7] <a href="https://stringr.tidyverse.org/" target="_blank">\n    <img class ...
## [8] <a href="https://forcats.tidyverse.org/" target="_blank">\n    <img class ...
## [9] <a href="https://lubridate.tidyverse.org/" target="_blank">\n    <img cla ...

7.8.2 Extract links

links <- a_elements %>%
  rvest::html_attr(name = "href")
links
## [1] "https://ggplot2.tidyverse.org/"   "https://dplyr.tidyverse.org/"    
## [3] "https://tidyr.tidyverse.org/"     "https://readr.tidyverse.org/"    
## [5] "https://purrr.tidyverse.org/"     "https://tibble.tidyverse.org/"   
## [7] "https://stringr.tidyverse.org/"   "https://forcats.tidyverse.org/"  
## [9] "https://lubridate.tidyverse.org/"
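
The hrefs on this page are already absolute. If a site used relative links (e.g. "/packages/ggplot2/"), we could resolve them against the base URL before downloading; a minimal sketch, using xml2 (which rvest builds on):

xml2::url_absolute("/packages/ggplot2/", base = "https://www.tidyverse.org/")
## [1] "https://www.tidyverse.org/packages/ggplot2/"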

7.8.3 Extract subpages

pages <- links %>% 
  map(rvest::read_html)
pages
## [[1]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[2]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[3]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[4]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[5]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[6]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[7]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[8]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[9]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
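
Downloading with a plain map() sends all requests back to back and stops at the first failure. A more defensive sketch wraps read_html() in purrr::possibly(), which returns NULL instead of erroring on a dead link, and adds a short delay between requests:

safe_read_html <- purrr::possibly(rvest::read_html, otherwise = NULL)
pages <- links %>% 
  map(function(url) {
    Sys.sleep(1) # be polite: at most one request per second
    safe_read_html(url)
  })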

The structure seems to be similar across all pages, so we can extract each package's name from the navbar

pages %>% 
  map(rvest::html_element, css = "a.navbar-brand") %>% 
  map_chr(rvest::html_text)
## [1] "ggplot2"   "dplyr"     "tidyr"     "readr"     "purrr"     "tibble"   
## [7] "stringr"   "forcats"   "lubridate"

and extract the version number

pages %>% 
  map(rvest::html_element, css = "small.nav-text.text-muted.me-auto") %>% 
  map_chr(rvest::html_text)
## [1] "3.5.2" "1.1.4" "1.3.1" "2.1.5" "1.1.0" "3.3.0" "1.5.1" "1.0.0" "1.9.4"

and we can also collect everything into a tibble

7.8.4 Extract text

pages_table <- tibble(
  name = pages %>% 
    map(rvest::html_element, css = "a.navbar-brand") %>% 
    map_chr(rvest::html_text),
  version = pages %>% 
    map(rvest::html_element, css = "small.nav-text.text-muted.me-auto") %>% 
    map_chr(rvest::html_text),
  CRAN = pages %>% 
    map(rvest::html_element, css = "ul.list-unstyled > li:nth-child(1) > a") %>% 
    map_chr(rvest::html_attr, name = "href"),
  Learn = pages %>% 
    map(rvest::html_element, css = "ul.list-unstyled > li:nth-child(4) > a") %>% 
    map_chr(rvest::html_attr, name = "href"), 
  text = pages %>%
    map(rvest::html_element,  css = "body") %>%
    map_chr(rvest::html_text2)
)
pages_table
## # A tibble: 9 × 5
##   name      version CRAN                                          Learn    text 
##   <chr>     <chr>   <chr>                                         <chr>    <chr>
## 1 ggplot2   3.5.2   https://cloud.r-project.org/package=ggplot2   https:/… "Ski…
## 2 dplyr     1.1.4   https://cloud.r-project.org/package=dplyr     http://… "Ski…
## 3 tidyr     1.3.1   https://cloud.r-project.org/package=tidyr     https:/… "Ski…
## 4 readr     2.1.5   https://cloud.r-project.org/package=readr     http://… "Ski…
## 5 purrr     1.1.0   https://cloud.r-project.org/package=purrr     http://… "Ski…
## 6 tibble    3.3.0   https://cloud.r-project.org/package=tibble    https:/… "Ski…
## 7 stringr   1.5.1   https://cloud.r-project.org/package=stringr   http://… "Ski…
## 8 forcats   1.0.0   https://cloud.r-project.org/package=forcats   http://… "Ski…
## 9 lubridate 1.9.4   https://cloud.r-project.org/package=lubridate https:/… "Ski…
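
The repeated map()/map_chr() pattern above can be wrapped in small helpers; a sketch, where html_text_at() and html_attr_at() are our own hypothetical names and the text column would be built as before:

html_text_at <- function(pages, css) {
  pages %>% 
    map(rvest::html_element, css = css) %>% 
    map_chr(rvest::html_text)
}
html_attr_at <- function(pages, css, name) {
  pages %>% 
    map(rvest::html_element, css = css) %>% 
    map_chr(rvest::html_attr, name = name)
}
tibble(
  name    = html_text_at(pages, "a.navbar-brand"),
  version = html_text_at(pages, "small.nav-text.text-muted.me-auto"),
  CRAN    = html_attr_at(pages, "ul.list-unstyled > li:nth-child(1) > a", "href"),
  Learn   = html_attr_at(pages, "ul.list-unstyled > li:nth-child(4) > a", "href")
)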

7.8.5 Create a corpus

We pass the tibble to quanteda's corpus() constructor: the text column supplies the document texts, and the remaining columns become document-level variables (docvars).

web_pages_txt_corpus <- corpus(pages_table)
print(web_pages_txt_corpus)
## Corpus consisting of 9 documents and 4 docvars.
## text1 :
## "Skip to content ggplot23.5.2 Get started Reference News Rele..."
## 
## text2 :
## "Skip to content dplyr1.1.4 Get started Reference Articles Gr..."
## 
## text3 :
## "Skip to content tidyr1.3.1 Tidy data Reference Articles Pivo..."
## 
## text4 :
## "Skip to content readr2.1.5 Get started Reference Articles Co..."
## 
## text5 :
## "Skip to content purrr1.1.0 Reference Articles purrr <-> base..."
## 
## text6 :
## "Skip to content tibble3.3.0 Get started Reference Articles C..."
## 
## [ reached max_ndoc ... 3 more documents ]

7.8.5.1 Summary

summary(web_pages_txt_corpus, 10)
## Corpus consisting of 9 documents, showing 9 documents:
## 
##   Text Types Tokens Sentences      name version
##  text1   368    777        24   ggplot2   3.5.2
##  text2   419   1258        17     dplyr   1.1.4
##  text3   326    729        25     tidyr   1.3.1
##  text4   571   1745        47     readr   2.1.5
##  text5   248    495        11     purrr   1.1.0
##  text6   269    717        14    tibble   3.3.0
##  text7   398   1345        23   stringr   1.5.1
##  text8   264    648        14   forcats   1.0.0
##  text9   267    650        11 lubridate   1.9.4
##                                           CRAN
##    https://cloud.r-project.org/package=ggplot2
##      https://cloud.r-project.org/package=dplyr
##      https://cloud.r-project.org/package=tidyr
##      https://cloud.r-project.org/package=readr
##      https://cloud.r-project.org/package=purrr
##     https://cloud.r-project.org/package=tibble
##    https://cloud.r-project.org/package=stringr
##    https://cloud.r-project.org/package=forcats
##  https://cloud.r-project.org/package=lubridate
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##            http://r4ds.had.co.nz/iteration.html
##             https://r4ds.had.co.nz/tibbles.html
##              http://r4ds.hadley.nz/strings.html
##              http://r4ds.had.co.nz/factors.html
##           https://r4ds.hadley.nz/datetimes.html
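
The per-document counts reported by summary() can also be computed directly with quanteda's counting helpers:

ntoken(web_pages_txt_corpus)    # tokens per document
ntype(web_pages_txt_corpus)     # unique types per document
nsentence(web_pages_txt_corpus) # sentences per document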

7.8.5.2 Accessing parts of corpus

web_pages_txt_corpus[[4]]
## [1] "Skip to content\nreadr2.1.5\nGet started\nReference\nArticles\nColumn type Locales\nNews\nReleases\nVersion 2.1.0Version 2.0.0Version 1.4.0Version 1.3.1Version 1.0.0Version 0.2.0Version 0.1.0\nChangelog\nreadr\nOverview\n\nThe goal of readr is to provide a fast and friendly way to read rectangular data from delimited files, such as comma-separated values (CSV) and tab-separated values (TSV). It is designed to parse many types of data found in the wild, while providing an informative problem report when parsing leads to unexpected results. If you are new to readr, the best place to start is the data import chapter in R for Data Science.\n\nInstallation\n\n# The easiest way to get readr is to install the whole tidyverse:\ninstall.packages(\"tidyverse\")\n\n# Alternatively, install just readr:\ninstall.packages(\"readr\")\nCheatsheet\n\nUsage\n\nreadr is part of the core tidyverse, so you can load it with:\n\n\nlibrary(tidyverse)\n#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n#> ✔ dplyr     1.1.4          ✔ readr     2.1.4.9000\n#> ✔ forcats   1.0.0          ✔ stringr   1.5.1     \n#> ✔ ggplot2   3.4.3          ✔ tibble    3.2.1     \n#> ✔ lubridate 1.9.3          ✔ tidyr     1.3.0     \n#> ✔ purrr     1.0.2          \n#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n#> ✖ dplyr::filter() masks stats::filter()\n#> ✖ dplyr::lag()    masks stats::lag()\n#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors\n\nOf course, you can also load readr as an individual package:\n\n\nlibrary(readr)\n\nTo read a rectangular dataset with readr, you combine two pieces: a function that parses the lines of the file into individual fields and a column specification.\n\nreadr supports the following file formats with these read_*() functions:\n\nread_csv(): comma-separated values (CSV)\nread_tsv(): tab-separated values (TSV)\nread_csv2(): semicolon-separated values with , as the decimal mark\nread_delim(): delimited files (CSV and TSV are important special cases)\nread_fwf(): fixed-width files\nread_table(): whitespace-separated files\nread_log(): web log files\n\nA column specification describes how each column should be converted from a character vector to a specific data type (e.g. character, numeric, datetime, etc.). In the absence of a column specification, readr will guess column types from the data. vignette(\"column-types\") gives more detail on how readr guesses the column types. Column type guessing is very handy, especially during data exploration, but it’s important to remember these are just guesses. 
As any data analysis project matures past the exploratory phase, the best strategy is to provide explicit column types.\n\nThe following example loads a sample file bundled with readr and guesses the column types:\n\n\n(chickens <- read_csv(readr_example(\"chickens.csv\")))\n#> Rows: 5 Columns: 4\n#> ── Column specification ────────────────────────────────────────────────────────\n#> Delimiter: \",\"\n#> chr (3): chicken, sex, motto\n#> dbl (1): eggs_laid\n#> \n#> ℹ Use `spec()` to retrieve the full column specification for this data.\n#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.\n#> # A tibble: 5 × 4\n#>   chicken                 sex     eggs_laid motto                               \n#>   <chr>                   <chr>       <dbl> <chr>                               \n#> 1 Foghorn Leghorn         rooster         0 That's a joke, ah say, that's a jok…\n#> 2 Chicken Little          hen             3 The sky is falling!                 \n#> 3 Ginger                  hen            12 Listen. We'll either die free chick…\n#> 4 Camilla the Chicken     hen             7 Bawk, buck, ba-gawk.                \n#> 5 Ernie The Giant Chicken rooster         0 Put Captain Solo in the cargo hold.\n\nNote that readr prints the column types – the guessed column types, in this case. This is useful because it allows you to check that the columns have been read in as you expect. If they haven’t, that means you need to provide the column specification. This sounds like a lot of trouble, but luckily readr affords a nice workflow for this. Use spec() to retrieve the (guessed) column specification from your initial effort.\n\n\nspec(chickens)\n#> cols(\n#>   chicken = col_character(),\n#>   sex = col_character(),\n#>   eggs_laid = col_double(),\n#>   motto = col_character()\n#> )\n\nNow you can copy, paste, and tweak this, to create a more explicit readr call that expresses the desired column types. Here we express that sex should be a factor with levels rooster and hen, in that order, and that eggs_laid should be integer.\n\n\nchickens <- read_csv(\n  readr_example(\"chickens.csv\"),\n  col_types = cols(\n    chicken   = col_character(),\n    sex       = col_factor(levels = c(\"rooster\", \"hen\")),\n    eggs_laid = col_integer(),\n    motto     = col_character()\n  )\n)\nchickens\n#> # A tibble: 5 × 4\n#>   chicken                 sex     eggs_laid motto                               \n#>   <chr>                   <fct>       <int> <chr>                               \n#> 1 Foghorn Leghorn         rooster         0 That's a joke, ah say, that's a jok…\n#> 2 Chicken Little          hen             3 The sky is falling!                 \n#> 3 Ginger                  hen            12 Listen. We'll either die free chick…\n#> 4 Camilla the Chicken     hen             7 Bawk, buck, ba-gawk.                \n#> 5 Ernie The Giant Chicken rooster         0 Put Captain Solo in the cargo hold.\n\nvignette(\"readr\") gives an expanded introduction to readr.\n\nEditions\n\nreadr got a new parsing engine in version 2.0.0 (released July 2021). In this so-called second edition, readr calls vroom::vroom(), by default.\n\nThe parsing engine in readr versions prior to 2.0.0 is now called the first edition. If you’re using readr >= 2.0.0, you can still access first edition parsing via the functions with_edition(1, ...) and local_edition(1). 
And, obviously, if you’re using readr < 2.0.0, you will get first edition parsing, by definition, because that’s all there is.\n\nWe will continue to support the first edition for a number of releases, but the overall goal is to make the second edition uniformly better than the first. Therefore the plan is to eventually deprecate and then remove the first edition code. New code and actively-maintained code should use the second edition. The workarounds with_edition(1, ...) and local_edition(1) are offered as a pragmatic way to patch up legacy code or as a temporary solution for infelicities identified as the second edition matures.\n\nAlternatives\n\nThere are two main alternatives to readr: base R and data.table’s fread(). The most important differences are discussed below.\n\nBase R\n\nCompared to the corresponding base functions, readr functions:\n\nUse a consistent naming scheme for the parameters (e.g. col_names and col_types not header and colClasses).\n\nAre generally much faster (up to 10x-100x) depending on the dataset.\n\nLeave strings as is by default, and automatically parse common date/time formats.\n\nHave a helpful progress bar if loading is going to take a while.\n\nAll functions work exactly the same way regardless of the current locale. To override the US-centric defaults, use locale().\n\ndata.table and fread()\n\ndata.table has a function similar to read_csv() called fread(). Compared to fread(), readr functions:\n\nAre sometimes slower, particularly on numeric heavy data.\n\nCan automatically guess some parameters, but basically encourage explicit specification of, e.g., the delimiter, skipped rows, and the header row.\n\nFollow tidyverse-wide conventions, such as returning a tibble, a standard approach for column name repair, and a common mini-language for column selection.\n\nAcknowledgements\n\nThanks to:\n\nJoe Cheng for showing me the beauty of deterministic finite automata for parsing, and for teaching me why I should write a tokenizer.\n\nJJ Allaire for helping me come up with a design that makes very few copies, and is easy to extend.\n\nDirk Eddelbuettel for coming up with the name!\n\nLinks\nView on CRAN\nBrowse source code\nReport a bug\nLearn more\nLicense\nFull license\nMIT + file LICENSE\nCommunity\nContributing guide\nCode of conduct\nGetting help\nCitation\nCiting readr\nDevelopers\nHadley Wickham\nAuthor\nJim Hester\nAuthor\nJennifer Bryan\nAuthor, maintainer\n\nCopyright holder, funder\nMore about authors...\n\nDeveloped by Hadley Wickham, Jim Hester, Jennifer Bryan, .\n\nSite built with pkgdown 2.0.7."

7.8.5.3 Document-level information

head(docvars(web_pages_txt_corpus))
##      name version                                        CRAN
## 1 ggplot2   3.5.2 https://cloud.r-project.org/package=ggplot2
## 2   dplyr   1.1.4   https://cloud.r-project.org/package=dplyr
## 3   tidyr   1.3.1   https://cloud.r-project.org/package=tidyr
## 4   readr   2.1.5   https://cloud.r-project.org/package=readr
## 5   purrr   1.1.0   https://cloud.r-project.org/package=purrr
## 6  tibble   3.3.0  https://cloud.r-project.org/package=tibble
##                                            Learn
## 1 https://r4ds.had.co.nz/data-visualisation.html
## 2           http://r4ds.had.co.nz/transform.html
## 3               https://r4ds.hadley.nz/data-tidy
## 4         http://r4ds.had.co.nz/data-import.html
## 5           http://r4ds.had.co.nz/iteration.html
## 6            https://r4ds.had.co.nz/tibbles.html
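
Because the docvars travel with the corpus, they can be used to subset it; for example (a sketch):

corpus_subset(web_pages_txt_corpus, name %in% c("dplyr", "tidyr"))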

7.8.6 Advanced manipulations

7.8.6.1 Tokens

tokens() segments the texts in a corpus into tokens (words or sentences) by word boundaries. Punctuation can be kept or removed, as the two variants below show.

7.8.6.1.1 With punctuation
web_pages_txt_corpus_tok <- tokens(web_pages_txt_corpus)
web_pages_txt_corpus_tok
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "to"           "content"      "ggplot23.5.2" "Get"         
##  [6] "started"      "Reference"    "News"         "Releases"     "Version"     
## [11] "3.5.0"        "Version"     
## [ ... and 765 more ]
## 
## text2 :
##  [1] "Skip"       "to"         "content"    "dplyr1.1.4" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Grouped"    "data"      
## [11] "Two-table"  "verbs"     
## [ ... and 1,246 more ]
## 
## text3 :
##  [1] "Skip"        "to"          "content"     "tidyr1.3.1"  "Tidy"       
##  [6] "data"        "Reference"   "Articles"    "Pivoting"    "Rectangling"
## [11] "Nested"      "data"       
## [ ... and 717 more ]
## 
## text4 :
##  [1] "Skip"       "to"         "content"    "readr2.1.5" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Column"     "type"      
## [11] "Locales"    "News"      
## [ ... and 1,733 more ]
## 
## text5 :
##  [1] "Skip"       "to"         "content"    "purrr1.1.0" "Reference" 
##  [6] "Articles"   "purrr"      "<"          "-"          ">"         
## [11] "base"       "R"         
## [ ... and 483 more ]
## 
## text6 :
##  [1] "Skip"        "to"          "content"     "tibble3.3.0" "Get"        
##  [6] "started"     "Reference"   "Articles"    "Column"      "types"      
## [11] "Controlling" "display"    
## [ ... and 705 more ]
## 
## [ reached max_ndoc ... 3 more documents ]
7.8.6.1.2 Without punctuation
web_pages_txt_corpus_tok_no_punct <- tokens(web_pages_txt_corpus, remove_punct = TRUE)
web_pages_txt_corpus_tok_no_punct
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "to"           "content"      "ggplot23.5.2" "Get"         
##  [6] "started"      "Reference"    "News"         "Releases"     "Version"     
## [11] "3.5.0"        "Version"     
## [ ... and 635 more ]
## 
## text2 :
##  [1] "Skip"       "to"         "content"    "dplyr1.1.4" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Grouped"    "data"      
## [11] "Two-table"  "verbs"     
## [ ... and 988 more ]
## 
## text3 :
##  [1] "Skip"        "to"          "content"     "tidyr1.3.1"  "Tidy"       
##  [6] "data"        "Reference"   "Articles"    "Pivoting"    "Rectangling"
## [11] "Nested"      "data"       
## [ ... and 547 more ]
## 
## text4 :
##  [1] "Skip"       "to"         "content"    "readr2.1.5" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Column"     "type"      
## [11] "Locales"    "News"      
## [ ... and 1,364 more ]
## 
## text5 :
##  [1] "Skip"       "to"         "content"    "purrr1.1.0" "Reference" 
##  [6] "Articles"   "purrr"      "<"          ">"          "base"      
## [11] "R"          "Functional"
## [ ... and 376 more ]
## 
## text6 :
##  [1] "Skip"        "to"          "content"     "tibble3.3.0" "Get"        
##  [6] "started"     "Reference"   "Articles"    "Column"      "types"      
## [11] "Controlling" "display"    
## [ ... and 536 more ]
## 
## [ reached max_ndoc ... 3 more documents ]
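
A common further cleaning step, not applied in what follows, is to drop English stopwords from the tokens; a sketch:

tokens_remove(web_pages_txt_corpus_tok_no_punct, pattern = stopwords("en"))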

7.8.6.2 Compound words

7.8.6.2.1 kwic Phrase
web_pages_txt_corpus_tok_no_punct_phrase <- kwic(web_pages_txt_corpus_tok_no_punct, pattern =  phrase("the tidy*"), window = 6)
head(web_pages_txt_corpus_tok_no_punct_phrase, 10)
## Keyword-in-context with 3 matches.                                                                     
##  [text1, 348:349]          give you a comprehensive introduction to |
##  [text3, 109:110]         that is used wherever possible throughout |
##  [text3, 477:478] please use community.rstudio.com Please note that |
##                                                        
##  the tidyverse | and these two chapters will get       
##  the tidyverse | If you ensure that your data          
##    the tidyr   | project is released with a Contributor
7.8.6.2.2 Compounds
web_pages_txt_corpus_tok_no_punct_comp <- tokens_compound(web_pages_txt_corpus_tok_no_punct, pattern = phrase("the tidy*"))
web_pages_txt_corpus_tok_no_punct_comp_kwic <- kwic(web_pages_txt_corpus_tok_no_punct_comp, pattern = phrase("the_tidy*"))
head(web_pages_txt_corpus_tok_no_punct_comp_kwic, 10)
## Keyword-in-context with 3 matches.                                                                          
##  [text1, 348]        you a comprehensive introduction to | the_tidyverse |
##  [text3, 109]       is used wherever possible throughout | the_tidyverse |
##  [text3, 476] use community.rstudio.com Please note that |   the_tidyr   |
##                             
##  and these two chapters will
##  If you ensure that your    
##  project is released with a

7.8.6.3 N-grams

N-grams are a subfamily of compound words: an n-gram is a sequence of n consecutive tokens, so they are called “bi-grams” (n = 2), “tri-grams” (n = 3), and so on. tokens_ngrams() generates them from an already tokenised text object.
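
A minimal toy illustration (using quanteda defaults, where "_" is the concatenator):

tokens_ngrams(tokens("text analysis is fun"), n = 2)
## bi-grams: "text_analysis" "analysis_is" "is_fun"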

7.8.6.3.1 Multi-grams

The code below generates all sequences of 2, 3, and 4 consecutive tokens and tabulates their frequencies.

web_pages_txt_corpus_tok_no_punct_ngram <- tokens_ngrams(web_pages_txt_corpus_tok_no_punct, n = 2:4) %>% 
  unlist() %>%
  tolower() %>%
  table()
## Top 10 rows
head(web_pages_txt_corpus_tok_no_punct_ngram, 10)
## .
##              $_cyl            $_cyl_|          $_cyl_|_>                ^_2 
##                  1                  1                  1                  2 
##              ^_2_+            ^_2_+_y              ^_2_>       ^_2_>_select 
##                  1                  1                  1                  1 
##   `_show_col_types `_show_col_types_= 
##                  1                  1
## Last 10 rows
tail(web_pages_txt_corpus_tok_no_punct_ngram, 10)
## .
##   zero_length_outputs_learn                  zones_leap 
##                           1                           1 
##             zones_leap_days    zones_leap_days_daylight 
##                           1                           1 
##               zones_with_tz      zones_with_tz_force_tz 
##                           1                           1 
## zones_with_tz_force_tz_time                  🧩_license 
##                           1                           1 
##             🧩_license_full     🧩_license_full_license 
##                           1                           1
7.8.6.3.2 Skip-grams

Skip-grams are n-grams formed from non-consecutive tokens: the skip argument specifies how many token positions may be skipped when forming each n-gram.
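
A toy illustration of the skip argument (skip = 0 would give the ordinary adjacent bi-grams):

tokens_ngrams(tokens("a b c d"), n = 2, skip = 1)
## bi-grams skipping one position: "a_c" "b_d"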

web_pages_txt_corpus_tok_no_punct_ngram_skip <- tokens_ngrams(web_pages_txt_corpus_tok_no_punct, n = 2:4, skip = 1:2)
## Top 10 rows
head(web_pages_txt_corpus_tok_no_punct_ngram_skip[[7]], 10)
##  [1] "Skip_content"           "Skip_stringr1.5.1"      "to_stringr1.5.1"       
##  [4] "to_Get"                 "content_Get"            "content_started"       
##  [7] "stringr1.5.1_started"   "stringr1.5.1_Reference" "Get_Reference"         
## [10] "Get_Articles"
## Last 10 rows
tail(web_pages_txt_corpus_tok_no_punct_ngram_skip[[7]], 10)
##  [1] "Developed_Hadley_built_pkgdown"  "Developed_Hadley_built_2.0.7"   
##  [3] "Developed_Wickham_built_pkgdown" "Developed_Wickham_built_2.0.7"  
##  [5] "Developed_Wickham_with_2.0.7"    "by_Wickham_built_pkgdown"       
##  [7] "by_Wickham_built_2.0.7"          "by_Wickham_with_2.0.7"          
##  [9] "by_Site_with_2.0.7"              "Hadley_Site_with_2.0.7"

7.8.6.4 Dictionary

If you have a dictionary that groups related words (e.g., spelling or pronunciation variants of a word) under a single generic key, you can look these up in the tokens. Here, we create and populate a small dictionary ourselves and show how to use it to search for items.

7.8.6.4.1 Create dictionary
dict_web_pages_txt <- dictionary(list(tidy_family = c("tidy*", "ggplot**"),
                                      r_packages = "*r"))
print(dict_web_pages_txt)
## Dictionary object with 2 key entries.
## - [tidy_family]:
##   - tidy*, ggplot**
## - [r_packages]:
##   - *r
7.8.6.4.2 Token lookup
web_pages_txt_corpus_tok_no_punct_dict_toks <- tokens_lookup(web_pages_txt_corpus_tok_no_punct, dictionary = dict_web_pages_txt)
print(web_pages_txt_corpus_tok_no_punct_dict_toks)
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "tidy_family" "r_packages"  "tidy_family" "tidy_family" "tidy_family"
##  [6] "tidy_family" "r_packages"  "r_packages"  "tidy_family" "tidy_family"
## [11] "tidy_family" "tidy_family"
## [ ... and 62 more ]
## 
## text2 :
##  [1] "r_packages" "r_packages" "r_packages" "r_packages" "r_packages"
##  [6] "r_packages" "r_packages" "r_packages" "r_packages" "r_packages"
## [11] "r_packages" "r_packages"
## [ ... and 96 more ]
## 
## text3 :
##  [1] "tidy_family" "tidy_family" "tidy_family" "r_packages"  "tidy_family"
##  [6] "r_packages"  "tidy_family" "tidy_family" "tidy_family" "r_packages" 
## [11] "tidy_family" "r_packages" 
## [ ... and 68 more ]
## 
## text4 :
##  [1] "r_packages"  "r_packages"  "r_packages"  "r_packages"  "r_packages" 
##  [6] "r_packages"  "r_packages"  "r_packages"  "tidy_family" "tidy_family"
## [11] "r_packages"  "r_packages" 
## [ ... and 110 more ]
## 
## text5 :
##  [1] "r_packages" "r_packages" "r_packages" "r_packages" "r_packages"
##  [6] "r_packages" "r_packages" "r_packages" "r_packages" "r_packages"
## [11] "r_packages" "r_packages"
## [ ... and 31 more ]
## 
## text6 :
##  [1] "r_packages"  "r_packages"  "r_packages"  "r_packages"  "r_packages" 
##  [6] "r_packages"  "r_packages"  "r_packages"  "r_packages"  "r_packages" 
## [11] "tidy_family" "tidy_family"
## [ ... and 16 more ]
## 
## [ reached max_ndoc ... 3 more documents ]
7.8.6.4.3 DFM
dfm(web_pages_txt_corpus_tok_no_punct_dict_toks)
## Document-feature matrix of: 9 documents, 2 features (0.00% sparse) and 4 docvars.
##        features
## docs    tidy_family r_packages
##   text1          33         41
##   text2           3        105
##   text3          29         51
##   text4          10        112
##   text5           3         40
##   text6           3         25
## [ reached max_ndoc ... 3 more documents ]

7.8.6.5 Part of Speech tagging

Part-of-Speech tagging (or PoS-tagging) is used to distinguish different parts of speech. For example, the sentence “Jane likes the girl” can be tagged as “Jane/NNP likes/VBZ the/DT girl/NN”, where NNP = proper noun (singular), VBZ = 3rd person singular present tense verb, DT = determiner, and NN = noun (singular or mass). We will use the udpipe package.

7.8.6.5.1 Download and load language model

Before using the PoS-tagger, we need to download a language model. As you can see from typing ?udpipe_download_model, models are available for 65 languages, trained on 101 treebanks.

file_to_check <- "models/english-ewt-ud-2.5-191206.udpipe"

if (file.exists(file_to_check)) {
  m_english <- udpipe_load_model(file = file_to_check)
} else {
  m_english <- udpipe_download_model(model_dir = "models/", language = "english-ewt")
  m_english <- udpipe_load_model(file = file_to_check)
}
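As a quick sanity check, we can tag the example sentence from the introduction; the xpos column should reproduce the NNP/VBZ/DT/NN tags shown above (a sketch):

udpipe_annotate(m_english, x = "Jane likes the girl") %>% 
  as.data.frame() %>% 
  dplyr::select(token, upos, xpos)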
7.8.6.5.2 Tokenise, tag, dependency parsing

We use the already tokenised text with no punctuation. Note that udpipe_annotate() treats each element of the character vector as a separate document, which is why every token below is annotated in isolation (each becomes its own one-word document with dep_rel root).

web_pages_txt_anndf <- udpipe_annotate(m_english, x = web_pages_txt_corpus_tok_no_punct[[1]]) %>%
  as.data.frame() 
## inspect
head(web_pages_txt_anndf, 10)
##    doc_id paragraph_id sentence_id     sentence token_id        token
## 1    doc1            1           1         Skip        1         Skip
## 2    doc2            1           1           to        1           to
## 3    doc3            1           1      content        1      content
## 4    doc4            1           1 ggplot23.5.2        1 ggplot23.5.2
## 5    doc5            1           1          Get        1          Get
## 6    doc6            1           1      started        1      started
## 7    doc7            1           1    Reference        1    Reference
## 8    doc8            1           1         News        1         News
## 9    doc9            1           1     Releases        1     Releases
## 10  doc10            1           1      Version        1      Version
##           lemma upos xpos                            feats head_token_id
## 1          skip NOUN   NN                      Number=Sing             0
## 2            to  ADP   IN                             <NA>             0
## 3       content NOUN   NN                      Number=Sing             0
## 4  ggplot23.5.2 NOUN   NN                      Number=Sing             0
## 5           get VERB   VB            Mood=Imp|VerbForm=Fin             0
## 6         start VERB  VBD Mood=Ind|Tense=Past|VerbForm=Fin             0
## 7     reference NOUN   NN                      Number=Sing             0
## 8          news NOUN   NN                      Number=Sing             0
## 9      releases NOUN  NNS                      Number=Plur             0
## 10      version NOUN   NN                      Number=Sing             0
##    dep_rel deps            misc
## 1     root <NA> SpacesAfter=\\n
## 2     root <NA> SpacesAfter=\\n
## 3     root <NA> SpacesAfter=\\n
## 4     root <NA> SpacesAfter=\\n
## 5     root <NA> SpacesAfter=\\n
## 6     root <NA> SpacesAfter=\\n
## 7     root <NA> SpacesAfter=\\n
## 8     root <NA> SpacesAfter=\\n
## 9     root <NA> SpacesAfter=\\n
## 10    root <NA> SpacesAfter=\\n
7.8.6.5.3 Dependency parsing
## parse the raw (untokenised) text of the first document
web_pages_txt_corpus_sent <- udpipe_annotate(m_english, x = web_pages_txt_corpus[[1]]) %>%
  as.data.frame()
## inspect
head(web_pages_txt_corpus_sent)
##   doc_id paragraph_id sentence_id
## 1   doc1            1           1
## 2   doc1            1           1
## 3   doc1            1           1
## 4   doc1            1           1
## 5   doc1            1           1
## 6   doc1            1           1
##                                                                                                                                                                                                                                                                               sentence
## 1 Skip to content ggplot23.5.2 Get started Reference News Releases Version 3.5.0 Version 3.4.0 Version 3.3.0 Version 3.2.0 Version 3.1.0 Version 3.0.0 Version 2.2.0 Version 2.1.0 Version 2.0.0 Version 1.0.0 Changelog Articles Aesthetic specifications Developer Extending ggplot2
## 2 Skip to content ggplot23.5.2 Get started Reference News Releases Version 3.5.0 Version 3.4.0 Version 3.3.0 Version 3.2.0 Version 3.1.0 Version 3.0.0 Version 2.2.0 Version 2.1.0 Version 2.0.0 Version 1.0.0 Changelog Articles Aesthetic specifications Developer Extending ggplot2
## 3 Skip to content ggplot23.5.2 Get started Reference News Releases Version 3.5.0 Version 3.4.0 Version 3.3.0 Version 3.2.0 Version 3.1.0 Version 3.0.0 Version 2.2.0 Version 2.1.0 Version 2.0.0 Version 1.0.0 Changelog Articles Aesthetic specifications Developer Extending ggplot2
## 4 Skip to content ggplot23.5.2 Get started Reference News Releases Version 3.5.0 Version 3.4.0 Version 3.3.0 Version 3.2.0 Version 3.1.0 Version 3.0.0 Version 2.2.0 Version 2.1.0 Version 2.0.0 Version 1.0.0 Changelog Articles Aesthetic specifications Developer Extending ggplot2
## 5 Skip to content ggplot23.5.2 Get started Reference News Releases Version 3.5.0 Version 3.4.0 Version 3.3.0 Version 3.2.0 Version 3.1.0 Version 3.0.0 Version 2.2.0 Version 2.1.0 Version 2.0.0 Version 1.0.0 Changelog Articles Aesthetic specifications Developer Extending ggplot2
## 6 Skip to content ggplot23.5.2 Get started Reference News Releases Version 3.5.0 Version 3.4.0 Version 3.3.0 Version 3.2.0 Version 3.1.0 Version 3.0.0 Version 2.2.0 Version 2.1.0 Version 2.0.0 Version 1.0.0 Changelog Articles Aesthetic specifications Developer Extending ggplot2
##   token_id        token        lemma upos xpos                    feats
## 1        1         Skip         skip VERB   VB    Mood=Imp|VerbForm=Fin
## 2        2           to           to PART   TO                     <NA>
## 3        3      content      contend VERB   VB             VerbForm=Inf
## 4        4 ggplot23.5.2 ggplot23.5.2 PART   TO                 Abbr=Yes
## 5        5          Get          get VERB   VB             VerbForm=Inf
## 6        6      started        start VERB  VBN Tense=Past|VerbForm=Part
##   head_token_id dep_rel deps            misc
## 1             0    root <NA>            <NA>
## 2             3    mark <NA>            <NA>
## 3             1   xcomp <NA> SpacesAfter=\\n
## 4             5    mark <NA> SpacesAfter=\\n
## 5             3   advcl <NA>            <NA>
## 6             8    amod <NA> SpacesAfter=\\n
web_pages_txt_corpus_sent_dplot <- textplot_dependencyparser(web_pages_txt_corpus_sent, size = 3) 
## show plot
web_pages_txt_corpus_sent_dplot

7.8.6.6 Feature co-occurrence matrix (FCM)

A feature co-occurrence matrix (FCM) records the number of times pairs of tokens co-occur, by default within documents.

7.8.6.6.1 Computing number of co-occurrences
web_pages_txt_corpus_dfmat <- dfm(web_pages_txt_corpus_tok_no_punct)
web_pages_txt_corpus_dfmat_trim <- dfm_trim(web_pages_txt_corpus_dfmat, min_termfreq = 5)

topfeatures_web_pages_txt_corpus <- topfeatures(web_pages_txt_corpus_dfmat_trim)
topfeatures_web_pages_txt_corpus
##   > the  to   a   ─  of   < and   1  is 
## 271 186 147 138 132  91  91  78  72  66
nfeat(web_pages_txt_corpus_dfmat_trim)
## [1] 257
7.8.6.6.2 Features co-occurrences
web_pages_txt_corpus_fcmat <- fcm(web_pages_txt_corpus_dfmat_trim)
web_pages_txt_corpus_fcmat
## Feature co-occurrence matrix of: 257 by 257 features.
##            features
## features    skip   to content get started reference news releases version 2.0.0
##   skip         0  147       9  22       9         9    9       10      55     7
##   to           0 1459     147 415     145       147  147      183     868   216
##   content      0    0       0  22       9         9    9       10      55     7
##   get          0    0       0  22      23        22   22       25     141    22
##   started      0    0       0   0       1         9    9       10      52     7
##   reference    0    0       0   0       0         0    9       10      55     7
##   news         0    0       0   0       0         0    0       10      55     7
##   releases     0    0       0   0       0         0    0        1      57    12
##   version      0    0       0   0       0         0    0        0     217    28
##   2.0.0        0    0       0   0       0         0    0        0       0    10
## [ reached max_nfeat ... 247 more features, reached max_nfeat ... 247 more features ]
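
Co-occurrence matrices are often easier to read as a network; a sketch using the quanteda.textplots package:

library(quanteda.textplots)
feat <- names(topfeatures(web_pages_txt_corpus_fcmat, 30))
fcm_select(web_pages_txt_corpus_fcmat, pattern = feat) %>% 
  textplot_network(min_freq = 0.5)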