6.8 Multiple webpages

6.8.1 Read_html

# Download and parse the tidyverse packages overview page.
website <- rvest::read_html("https://www.tidyverse.org/packages/")
website
## {html_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n  <div id="appTidyverseSite" class="shrinkHeader alwaysShrinkHead ...
# Select every <a> that is a direct child of a <div class="package">.
a_elements <- rvest::html_elements(website, css = "div.package > a")
a_elements
## {xml_nodeset (9)}
## [1] <a href="https://ggplot2.tidyverse.org/" target="_blank">\n    <img class ...
## [2] <a href="https://dplyr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [3] <a href="https://tidyr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [4] <a href="https://readr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [5] <a href="https://purrr.tidyverse.org/" target="_blank">\n    <img class=" ...
## [6] <a href="https://tibble.tidyverse.org/" target="_blank">\n    <img class= ...
## [7] <a href="https://stringr.tidyverse.org/" target="_blank">\n    <img class ...
## [8] <a href="https://forcats.tidyverse.org/" target="_blank">\n    <img class ...
## [9] <a href="https://lubridate.tidyverse.org/" target="_blank">\n    <img cla ...

6.8.2 Extract links

# Pull the value of the href attribute out of each selected <a> element.
links <- rvest::html_attr(a_elements, name = "href")
links
## [1] "https://ggplot2.tidyverse.org/"   "https://dplyr.tidyverse.org/"    
## [3] "https://tidyr.tidyverse.org/"     "https://readr.tidyverse.org/"    
## [5] "https://purrr.tidyverse.org/"     "https://tibble.tidyverse.org/"   
## [7] "https://stringr.tidyverse.org/"   "https://forcats.tidyverse.org/"  
## [9] "https://lubridate.tidyverse.org/"

6.8.3 Extract subpages

# Download and parse each linked page; the result is a list with one
# parsed document per entry of `links`.
pages <- map(links, rvest::read_html)
pages
## [[1]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[2]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[3]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[4]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[5]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[6]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[7]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[8]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[9]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...

The structure seems to be similar across all pages, so the same CSS selector can be used to extract the package name from each of them:

# For every page, take the first element matching "a.navbar-brand" and
# return its text as a character vector.
map_chr(pages, function(page) {
  rvest::html_text(rvest::html_element(page, css = "a.navbar-brand"))
})
## [1] "ggplot2"   "dplyr"     "tidyr"     "readr"     "purrr"     "tibble"   
## [7] "stringr"   "forcats"   "lubridate"

The version number can be extracted in the same way:

# For every page, take the first <small> element carrying the classes
# nav-text, text-muted and me-auto, and return its text.
map_chr(pages, function(page) {
  version_node <- rvest::html_element(
    page,
    css = "small.nav-text.text-muted.me-auto"
  )
  rvest::html_text(version_node)
})
## [1] "3.5.2" "1.1.4" "1.3.1" "2.1.5" "1.1.0" "3.3.0" "1.5.1" "1.0.0" "1.9.4"

We can also combine all of this information into a single tibble:

6.8.4 Extract text

# Build one row per package page. Each column is extracted from the parsed
# pages with its own CSS selector.
pages_table <- tibble(
  name = map_chr(pages, function(page) {
    rvest::html_text(rvest::html_element(page, css = "a.navbar-brand"))
  }),
  version = map_chr(pages, function(page) {
    version_node <- rvest::html_element(
      page,
      css = "small.nav-text.text-muted.me-auto"
    )
    rvest::html_text(version_node)
  }),
  # href of the link inside the 1st item of the "list-unstyled" list.
  CRAN = map_chr(pages, function(page) {
    cran_link <- rvest::html_element(
      page,
      css = "ul.list-unstyled > li:nth-child(1) > a"
    )
    rvest::html_attr(cran_link, name = "href")
  }),
  # href of the link inside the 4th item of the "list-unstyled" list.
  Learn = map_chr(pages, function(page) {
    learn_link <- rvest::html_element(
      page,
      css = "ul.list-unstyled > li:nth-child(4) > a"
    )
    rvest::html_attr(learn_link, name = "href")
  }),
  # Text of the whole <body>, extracted with html_text2().
  text = map_chr(pages, function(page) {
    rvest::html_text2(rvest::html_element(page, css = "body"))
  })
)
pages_table
## # A tibble: 9 × 5
##   name      version CRAN                                          Learn    text 
##   <chr>     <chr>   <chr>                                         <chr>    <chr>
## 1 ggplot2   3.5.2   https://cloud.r-project.org/package=ggplot2   https:/… "Ski…
## 2 dplyr     1.1.4   https://cloud.r-project.org/package=dplyr     http://… "Ski…
## 3 tidyr     1.3.1   https://cloud.r-project.org/package=tidyr     https:/… "Ski…
## 4 readr     2.1.5   https://cloud.r-project.org/package=readr     http://… "Ski…
## 5 purrr     1.1.0   https://cloud.r-project.org/package=purrr     http://… "Ski…
## 6 tibble    3.3.0   https://cloud.r-project.org/package=tibble    https:/… "Ski…
## 7 stringr   1.5.1   https://cloud.r-project.org/package=stringr   http://… "Ski…
## 8 forcats   1.0.0   https://cloud.r-project.org/package=forcats   http://… "Ski…
## 9 lubridate 1.9.4   https://cloud.r-project.org/package=lubridate https:/… "Ski…

6.8.5 Create a corpus

# Turn the tibble into a corpus: the `text` column supplies the document
# texts and the remaining columns are kept as document variables.
web_pages_txt_corpus <- corpus(pages_table, text_field = "text")
print(web_pages_txt_corpus)
## Corpus consisting of 9 documents and 4 docvars.
## text1 :
## "Skip to content ggplot23.5.2 Get started Reference News Rele..."
## 
## text2 :
## "Skip to content dplyr1.1.4 Get started Reference Articles Gr..."
## 
## text3 :
## "Skip to content tidyr1.3.1 Tidy data Reference Articles Pivo..."
## 
## text4 :
## "Skip to content readr2.1.5 Get started Reference Articles Co..."
## 
## text5 :
## "Skip to content purrr1.1.0 Reference Articles purrr <-> base..."
## 
## text6 :
## "Skip to content tibble3.3.0 Get started Reference Articles C..."
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.5.1 Summary

# Summarise at most 10 documents (the corpus only holds 9).
summary(web_pages_txt_corpus, n = 10)
## Corpus consisting of 9 documents, showing 9 documents:
## 
##   Text Types Tokens Sentences      name version
##  text1   368    777        24   ggplot2   3.5.2
##  text2   419   1258        17     dplyr   1.1.4
##  text3   326    729        25     tidyr   1.3.1
##  text4   571   1745        47     readr   2.1.5
##  text5   248    495        11     purrr   1.1.0
##  text6   269    717        14    tibble   3.3.0
##  text7   398   1345        23   stringr   1.5.1
##  text8   264    648        14   forcats   1.0.0
##  text9   267    650        11 lubridate   1.9.4
##                                           CRAN
##    https://cloud.r-project.org/package=ggplot2
##      https://cloud.r-project.org/package=dplyr
##      https://cloud.r-project.org/package=tidyr
##      https://cloud.r-project.org/package=readr
##      https://cloud.r-project.org/package=purrr
##     https://cloud.r-project.org/package=tibble
##    https://cloud.r-project.org/package=stringr
##    https://cloud.r-project.org/package=forcats
##  https://cloud.r-project.org/package=lubridate
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##            http://r4ds.had.co.nz/iteration.html
##             https://r4ds.had.co.nz/tibbles.html
##              http://r4ds.hadley.nz/strings.html
##              http://r4ds.had.co.nz/factors.html
##           https://r4ds.hadley.nz/datetimes.html

6.8.5.2 Accessing parts of corpus

# Extract the full text of the fourth document (the readr page).
web_pages_txt_corpus[[4]]
## [1] "Skip to content\nreadr2.1.5\nGet started\nReference\nArticles\nColumn type Locales\nNews\nReleases\nVersion 2.1.0Version 2.0.0Version 1.4.0Version 1.3.1Version 1.0.0Version 0.2.0Version 0.1.0\nChangelog\nreadr\nOverview\n\nThe goal of readr is to provide a fast and friendly way to read rectangular data from delimited files, such as comma-separated values (CSV) and tab-separated values (TSV). It is designed to parse many types of data found in the wild, while providing an informative problem report when parsing leads to unexpected results. If you are new to readr, the best place to start is the data import chapter in R for Data Science.\n\nInstallation\n\n# The easiest way to get readr is to install the whole tidyverse:\ninstall.packages(\"tidyverse\")\n\n# Alternatively, install just readr:\ninstall.packages(\"readr\")\nCheatsheet\n\nUsage\n\nreadr is part of the core tidyverse, so you can load it with:\n\n\nlibrary(tidyverse)\n#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n#> ✔ dplyr     1.1.4          ✔ readr     2.1.4.9000\n#> ✔ forcats   1.0.0          ✔ stringr   1.5.1     \n#> ✔ ggplot2   3.4.3          ✔ tibble    3.2.1     \n#> ✔ lubridate 1.9.3          ✔ tidyr     1.3.0     \n#> ✔ purrr     1.0.2          \n#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n#> ✖ dplyr::filter() masks stats::filter()\n#> ✖ dplyr::lag()    masks stats::lag()\n#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors\n\nOf course, you can also load readr as an individual package:\n\n\nlibrary(readr)\n\nTo read a rectangular dataset with readr, you combine two pieces: a function that parses the lines of the file into individual fields and a column specification.\n\nreadr supports the following file formats with these read_*() functions:\n\nread_csv(): comma-separated values (CSV)\nread_tsv(): tab-separated values (TSV)\nread_csv2(): 
semicolon-separated values with , as the decimal mark\nread_delim(): delimited files (CSV and TSV are important special cases)\nread_fwf(): fixed-width files\nread_table(): whitespace-separated files\nread_log(): web log files\n\nA column specification describes how each column should be converted from a character vector to a specific data type (e.g. character, numeric, datetime, etc.). In the absence of a column specification, readr will guess column types from the data. vignette(\"column-types\") gives more detail on how readr guesses the column types. Column type guessing is very handy, especially during data exploration, but it’s important to remember these are just guesses. As any data analysis project matures past the exploratory phase, the best strategy is to provide explicit column types.\n\nThe following example loads a sample file bundled with readr and guesses the column types:\n\n\n(chickens <- read_csv(readr_example(\"chickens.csv\")))\n#> Rows: 5 Columns: 4\n#> ── Column specification ────────────────────────────────────────────────────────\n#> Delimiter: \",\"\n#> chr (3): chicken, sex, motto\n#> dbl (1): eggs_laid\n#> \n#> ℹ Use `spec()` to retrieve the full column specification for this data.\n#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.\n#> # A tibble: 5 × 4\n#>   chicken                 sex     eggs_laid motto                               \n#>   <chr>                   <chr>       <dbl> <chr>                               \n#> 1 Foghorn Leghorn         rooster         0 That's a joke, ah say, that's a jok…\n#> 2 Chicken Little          hen             3 The sky is falling!                 \n#> 3 Ginger                  hen            12 Listen. We'll either die free chick…\n#> 4 Camilla the Chicken     hen             7 Bawk, buck, ba-gawk.                
\n#> 5 Ernie The Giant Chicken rooster         0 Put Captain Solo in the cargo hold.\n\nNote that readr prints the column types – the guessed column types, in this case. This is useful because it allows you to check that the columns have been read in as you expect. If they haven’t, that means you need to provide the column specification. This sounds like a lot of trouble, but luckily readr affords a nice workflow for this. Use spec() to retrieve the (guessed) column specification from your initial effort.\n\n\nspec(chickens)\n#> cols(\n#>   chicken = col_character(),\n#>   sex = col_character(),\n#>   eggs_laid = col_double(),\n#>   motto = col_character()\n#> )\n\nNow you can copy, paste, and tweak this, to create a more explicit readr call that expresses the desired column types. Here we express that sex should be a factor with levels rooster and hen, in that order, and that eggs_laid should be integer.\n\n\nchickens <- read_csv(\n  readr_example(\"chickens.csv\"),\n  col_types = cols(\n    chicken   = col_character(),\n    sex       = col_factor(levels = c(\"rooster\", \"hen\")),\n    eggs_laid = col_integer(),\n    motto     = col_character()\n  )\n)\nchickens\n#> # A tibble: 5 × 4\n#>   chicken                 sex     eggs_laid motto                               \n#>   <chr>                   <fct>       <int> <chr>                               \n#> 1 Foghorn Leghorn         rooster         0 That's a joke, ah say, that's a jok…\n#> 2 Chicken Little          hen             3 The sky is falling!                 \n#> 3 Ginger                  hen            12 Listen. We'll either die free chick…\n#> 4 Camilla the Chicken     hen             7 Bawk, buck, ba-gawk.                \n#> 5 Ernie The Giant Chicken rooster         0 Put Captain Solo in the cargo hold.\n\nvignette(\"readr\") gives an expanded introduction to readr.\n\nEditions\n\nreadr got a new parsing engine in version 2.0.0 (released July 2021). 
In this so-called second edition, readr calls vroom::vroom(), by default.\n\nThe parsing engine in readr versions prior to 2.0.0 is now called the first edition. If you’re using readr >= 2.0.0, you can still access first edition parsing via the functions with_edition(1, ...) and local_edition(1). And, obviously, if you’re using readr < 2.0.0, you will get first edition parsing, by definition, because that’s all there is.\n\nWe will continue to support the first edition for a number of releases, but the overall goal is to make the second edition uniformly better than the first. Therefore the plan is to eventually deprecate and then remove the first edition code. New code and actively-maintained code should use the second edition. The workarounds with_edition(1, ...) and local_edition(1) are offered as a pragmatic way to patch up legacy code or as a temporary solution for infelicities identified as the second edition matures.\n\nAlternatives\n\nThere are two main alternatives to readr: base R and data.table’s fread(). The most important differences are discussed below.\n\nBase R\n\nCompared to the corresponding base functions, readr functions:\n\nUse a consistent naming scheme for the parameters (e.g. col_names and col_types not header and colClasses).\n\nAre generally much faster (up to 10x-100x) depending on the dataset.\n\nLeave strings as is by default, and automatically parse common date/time formats.\n\nHave a helpful progress bar if loading is going to take a while.\n\nAll functions work exactly the same way regardless of the current locale. To override the US-centric defaults, use locale().\n\ndata.table and fread()\n\ndata.table has a function similar to read_csv() called fread(). 
Compared to fread(), readr functions:\n\nAre sometimes slower, particularly on numeric heavy data.\n\nCan automatically guess some parameters, but basically encourage explicit specification of, e.g., the delimiter, skipped rows, and the header row.\n\nFollow tidyverse-wide conventions, such as returning a tibble, a standard approach for column name repair, and a common mini-language for column selection.\n\nAcknowledgements\n\nThanks to:\n\nJoe Cheng for showing me the beauty of deterministic finite automata for parsing, and for teaching me why I should write a tokenizer.\n\nJJ Allaire for helping me come up with a design that makes very few copies, and is easy to extend.\n\nDirk Eddelbuettel for coming up with the name!\n\nLinks\nView on CRAN\nBrowse source code\nReport a bug\nLearn more\nLicense\nFull license\nMIT + file LICENSE\nCommunity\nContributing guide\nCode of conduct\nGetting help\nCitation\nCiting readr\nDevelopers\nHadley Wickham\nAuthor\nJim Hester\nAuthor\nJennifer Bryan\nAuthor, maintainer\n\nCopyright holder, funder\nMore about authors...\n\nDeveloped by Hadley Wickham, Jim Hester, Jennifer Bryan, .\n\nSite built with pkgdown 2.0.7."

6.8.5.3 Document-level information

# Show the first six rows of the document-level variables.
web_pages_txt_corpus %>%
  docvars() %>%
  head()
##      name version                                        CRAN
## 1 ggplot2   3.5.2 https://cloud.r-project.org/package=ggplot2
## 2   dplyr   1.1.4   https://cloud.r-project.org/package=dplyr
## 3   tidyr   1.3.1   https://cloud.r-project.org/package=tidyr
## 4   readr   2.1.5   https://cloud.r-project.org/package=readr
## 5   purrr   1.1.0   https://cloud.r-project.org/package=purrr
## 6  tibble   3.3.0  https://cloud.r-project.org/package=tibble
##                                            Learn
## 1 https://r4ds.had.co.nz/data-visualisation.html
## 2           http://r4ds.had.co.nz/transform.html
## 3               https://r4ds.hadley.nz/data-tidy
## 4         http://r4ds.had.co.nz/data-import.html
## 5           http://r4ds.had.co.nz/iteration.html
## 6            https://r4ds.had.co.nz/tibbles.html

6.8.6 Basic manipulations

By default, each row of the data frame becomes one “document” of the corpus (here, one document per web page). We can reshape the corpus so that each document is a “sentence” or a “paragraph” instead.

6.8.6.1 Sentences

6.8.6.1.1 Transform
# Split every page into sentences, so that each sentence becomes its own
# document in the reshaped corpus.
web_pages_txt_corpus_sent <- web_pages_txt_corpus %>%
  corpus_reshape(to = "sentences")
web_pages_txt_corpus_sent
## Corpus consisting of 186 documents and 4 docvars.
## text1.1 :
## "Skip to content ggplot23.5.2 Get started Reference News Rele..."
## 
## text1.2 :
## "You provide the data, tell ggplot2 how to map variables to a..."
## 
## text1.3 :
## "Installation  # The easiest way to get ggplot2 is to install..."
## 
## text1.4 :
## "However, in most cases you start with ggplot(), supply a dat..."
## 
## text1.5 :
## "You then add on layers (like geom_point() or geom_histogram(..."
## 
## text1.6 :
## "That means, by-and-large, ggplot2 itself changes relatively ..."
## 
## [ reached max_ndoc ... 180 more documents ]
6.8.6.1.2 Summary
# Summarise the sentence-level corpus; only the first 100 documents are shown.
summary(web_pages_txt_corpus_sent, n = 100)
## Corpus consisting of 186 documents, showing 100 documents:
## 
##      Text Types Tokens Sentences    name version
##   text1.1    59     82         1 ggplot2   3.5.2
##   text1.2    23     28         1 ggplot2   3.5.2
##   text1.3    45     79         1 ggplot2   3.5.2
##   text1.4    19     25         1 ggplot2   3.5.2
##   text1.5    50     88         1 ggplot2   3.5.2
##   text1.6    10     11         1 ggplot2   3.5.2
##   text1.7    30     42         1 ggplot2   3.5.2
##   text1.8    17     17         1 ggplot2   3.5.2
##   text1.9     7      7         1 ggplot2   3.5.2
##  text1.10    25     29         1 ggplot2   3.5.2
##  text1.11    21     22         1 ggplot2   3.5.2
##  text1.12    31     36         1 ggplot2   3.5.2
##  text1.13    20     20         1 ggplot2   3.5.2
##  text1.14    18     18         1 ggplot2   3.5.2
##  text1.15    23     24         1 ggplot2   3.5.2
##  text1.16    12     12         1 ggplot2   3.5.2
##  text1.17    20     20         1 ggplot2   3.5.2
##  text1.18    16     17         1 ggplot2   3.5.2
##  text1.19    25     26         1 ggplot2   3.5.2
##  text1.20    24     27         1 ggplot2   3.5.2
##  text1.21    13     13         1 ggplot2   3.5.2
##  text1.22    21     22         1 ggplot2   3.5.2
##  text1.23    63    106         1 ggplot2   3.5.2
##  text1.24     6      6         1 ggplot2   3.5.2
##   text2.1    27     31         1   dplyr   1.1.4
##   text2.2    66    101         1   dplyr   1.1.4
##   text2.3    20     20         1   dplyr   1.1.4
##   text2.4    13     14         1   dplyr   1.1.4
##   text2.5    25     29         1   dplyr   1.1.4
##   text2.6    21     23         1   dplyr   1.1.4
##   text2.7    20     20         1   dplyr   1.1.4
##   text2.8    33     40         1   dplyr   1.1.4
##   text2.9    17     19         1   dplyr   1.1.4
##  text2.10    20     21         1   dplyr   1.1.4
##  text2.11    36     44         1   dplyr   1.1.4
##  text2.12   194    778         1   dplyr   1.1.4
##  text2.13    15     15         1   dplyr   1.1.4
##  text2.14    14     14         1   dplyr   1.1.4
##  text2.15    13     13         1   dplyr   1.1.4
##  text2.16    50     70         1   dplyr   1.1.4
##  text2.17     6      6         1   dplyr   1.1.4
##   text3.1    14     17         1   tidyr   1.3.1
##   text3.2    32     44         1   tidyr   1.3.1
##   text3.3    12     18         1   tidyr   1.3.1
##   text3.4     9     12         1   tidyr   1.3.1
##   text3.5     9     13         1   tidyr   1.3.1
##   text3.6    17     18         1   tidyr   1.3.1
##   text3.7    23     25         1   tidyr   1.3.1
##   text3.8    12     13         1   tidyr   1.3.1
##   text3.9    57    109         1   tidyr   1.3.1
##  text3.10    10     11         1   tidyr   1.3.1
##  text3.11    18     18         1   tidyr   1.3.1
##  text3.12    15     24         1   tidyr   1.3.1
##  text3.13    23     26         1   tidyr   1.3.1
##  text3.14    13     20         1   tidyr   1.3.1
##  text3.15     6      6         1   tidyr   1.3.1
##  text3.16    22     37         1   tidyr   1.3.1
##  text3.17    24     42         1   tidyr   1.3.1
##  text3.18    12     14         1   tidyr   1.3.1
##  text3.19   110    153         1   tidyr   1.3.1
##  text3.20    10     10         1   tidyr   1.3.1
##  text3.21    15     15         1   tidyr   1.3.1
##  text3.22    13     14         1   tidyr   1.3.1
##  text3.23    47     52         1   tidyr   1.3.1
##  text3.24    10     12         1   tidyr   1.3.1
##  text3.25     6      6         1   tidyr   1.3.1
##   text4.1    51     58         1   readr   2.1.5
##   text4.2    26     27         1   readr   2.1.5
##   text4.3    21     23         1   readr   2.1.5
##   text4.4   148    388         1   readr   2.1.5
##   text4.5    26     34         1   readr   2.1.5
##   text4.6    21     22         1   readr   2.1.5
##   text4.7    20     21         1   readr   2.1.5
##   text4.8    63    167         1   readr   2.1.5
##   text4.9    39     59         1   readr   2.1.5
##  text4.10     8      8         1   readr   2.1.5
##  text4.11    19     20         1   readr   2.1.5
##  text4.12    17     17         1   readr   2.1.5
##  text4.13    14     17         1   readr   2.1.5
##  text4.14    19     20         1   readr   2.1.5
##  text4.15    14     14         1   readr   2.1.5
##  text4.16    17     18         1   readr   2.1.5
##  text4.17    45     84         1   readr   2.1.5
##  text4.18    69    139         1   readr   2.1.5
##  text4.19     8      8         1   readr   2.1.5
##  text4.20    19     20         1   readr   2.1.5
##  text4.21    27     30         1   readr   2.1.5
##  text4.22    16     16         1   readr   2.1.5
##  text4.23    15     18         1   readr   2.1.5
##  text4.24    16     16         1   readr   2.1.5
##  text4.25    25     32         1   readr   2.1.5
##  text4.26    23     27         1   readr   2.1.5
##  text4.27    24     30         1   readr   2.1.5
##  text4.28    14     15         1   readr   2.1.5
##  text4.29    10     11         1   readr   2.1.5
##  text4.30    31     40         1   readr   2.1.5
##  text4.31    17     17         1   readr   2.1.5
##  text4.32     8      8         1   readr   2.1.5
##  text4.33    28     32         1   readr   2.1.5
##  text4.34    14     14         1   readr   2.1.5
##                                         CRAN
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
6.8.6.1.3 Subset

We can keep only the sentences that contain 10 or more tokens:

# Keep only the sentences that contain at least 10 tokens.
web_pages_txt_corpus_sent_long <- corpus_subset(
  web_pages_txt_corpus_sent,
  subset = ntoken(web_pages_txt_corpus_sent) >= 10
)
# Count the sentences that passed the filter.
ndoc(web_pages_txt_corpus_sent_long)
## [1] 164
# Summarise the filtered corpus; the listing below shows 100 of its documents.
summary(web_pages_txt_corpus_sent_long)
## Corpus consisting of 164 documents, showing 100 documents:
## 
##      Text Types Tokens Sentences    name version
##   text1.1    59     82         1 ggplot2   3.5.2
##   text1.2    23     28         1 ggplot2   3.5.2
##   text1.3    45     79         1 ggplot2   3.5.2
##   text1.4    19     25         1 ggplot2   3.5.2
##   text1.5    50     88         1 ggplot2   3.5.2
##   text1.6    10     11         1 ggplot2   3.5.2
##   text1.7    30     42         1 ggplot2   3.5.2
##   text1.8    17     17         1 ggplot2   3.5.2
##  text1.10    25     29         1 ggplot2   3.5.2
##  text1.11    21     22         1 ggplot2   3.5.2
##  text1.12    31     36         1 ggplot2   3.5.2
##  text1.13    20     20         1 ggplot2   3.5.2
##  text1.14    18     18         1 ggplot2   3.5.2
##  text1.15    23     24         1 ggplot2   3.5.2
##  text1.16    12     12         1 ggplot2   3.5.2
##  text1.17    20     20         1 ggplot2   3.5.2
##  text1.18    16     17         1 ggplot2   3.5.2
##  text1.19    25     26         1 ggplot2   3.5.2
##  text1.20    24     27         1 ggplot2   3.5.2
##  text1.21    13     13         1 ggplot2   3.5.2
##  text1.22    21     22         1 ggplot2   3.5.2
##  text1.23    63    106         1 ggplot2   3.5.2
##   text2.1    27     31         1   dplyr   1.1.4
##   text2.2    66    101         1   dplyr   1.1.4
##   text2.3    20     20         1   dplyr   1.1.4
##   text2.4    13     14         1   dplyr   1.1.4
##   text2.5    25     29         1   dplyr   1.1.4
##   text2.6    21     23         1   dplyr   1.1.4
##   text2.7    20     20         1   dplyr   1.1.4
##   text2.8    33     40         1   dplyr   1.1.4
##   text2.9    17     19         1   dplyr   1.1.4
##  text2.10    20     21         1   dplyr   1.1.4
##  text2.11    36     44         1   dplyr   1.1.4
##  text2.12   194    778         1   dplyr   1.1.4
##  text2.13    15     15         1   dplyr   1.1.4
##  text2.14    14     14         1   dplyr   1.1.4
##  text2.15    13     13         1   dplyr   1.1.4
##  text2.16    50     70         1   dplyr   1.1.4
##   text3.1    14     17         1   tidyr   1.3.1
##   text3.2    32     44         1   tidyr   1.3.1
##   text3.3    12     18         1   tidyr   1.3.1
##   text3.4     9     12         1   tidyr   1.3.1
##   text3.5     9     13         1   tidyr   1.3.1
##   text3.6    17     18         1   tidyr   1.3.1
##   text3.7    23     25         1   tidyr   1.3.1
##   text3.8    12     13         1   tidyr   1.3.1
##   text3.9    57    109         1   tidyr   1.3.1
##  text3.10    10     11         1   tidyr   1.3.1
##  text3.11    18     18         1   tidyr   1.3.1
##  text3.12    15     24         1   tidyr   1.3.1
##  text3.13    23     26         1   tidyr   1.3.1
##  text3.14    13     20         1   tidyr   1.3.1
##  text3.16    22     37         1   tidyr   1.3.1
##  text3.17    24     42         1   tidyr   1.3.1
##  text3.18    12     14         1   tidyr   1.3.1
##  text3.19   110    153         1   tidyr   1.3.1
##  text3.20    10     10         1   tidyr   1.3.1
##  text3.21    15     15         1   tidyr   1.3.1
##  text3.22    13     14         1   tidyr   1.3.1
##  text3.23    47     52         1   tidyr   1.3.1
##  text3.24    10     12         1   tidyr   1.3.1
##   text4.1    51     58         1   readr   2.1.5
##   text4.2    26     27         1   readr   2.1.5
##   text4.3    21     23         1   readr   2.1.5
##   text4.4   148    388         1   readr   2.1.5
##   text4.5    26     34         1   readr   2.1.5
##   text4.6    21     22         1   readr   2.1.5
##   text4.7    20     21         1   readr   2.1.5
##   text4.8    63    167         1   readr   2.1.5
##   text4.9    39     59         1   readr   2.1.5
##  text4.11    19     20         1   readr   2.1.5
##  text4.12    17     17         1   readr   2.1.5
##  text4.13    14     17         1   readr   2.1.5
##  text4.14    19     20         1   readr   2.1.5
##  text4.15    14     14         1   readr   2.1.5
##  text4.16    17     18         1   readr   2.1.5
##  text4.17    45     84         1   readr   2.1.5
##  text4.18    69    139         1   readr   2.1.5
##  text4.20    19     20         1   readr   2.1.5
##  text4.21    27     30         1   readr   2.1.5
##  text4.22    16     16         1   readr   2.1.5
##  text4.23    15     18         1   readr   2.1.5
##  text4.24    16     16         1   readr   2.1.5
##  text4.25    25     32         1   readr   2.1.5
##  text4.26    23     27         1   readr   2.1.5
##  text4.27    24     30         1   readr   2.1.5
##  text4.28    14     15         1   readr   2.1.5
##  text4.29    10     11         1   readr   2.1.5
##  text4.30    31     40         1   readr   2.1.5
##  text4.31    17     17         1   readr   2.1.5
##  text4.33    28     32         1   readr   2.1.5
##  text4.34    14     14         1   readr   2.1.5
##  text4.35    16     16         1   readr   2.1.5
##  text4.36    13     14         1   readr   2.1.5
##  text4.37    12     13         1   readr   2.1.5
##  text4.38    21     30         1   readr   2.1.5
##  text4.39    18     19         1   readr   2.1.5
##  text4.40    21     27         1   readr   2.1.5
##  text4.41    20     26         1   readr   2.1.5
##  text4.42    26     29         1   readr   2.1.5
##                                         CRAN
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html

6.8.6.2 Paragraphs

6.8.6.2.1 Transform
# Reshape the page-level corpus so that every paragraph becomes its own
# document, then print the result to preview it.
web_pages_txt_corpus_para <- web_pages_txt_corpus %>%
  corpus_reshape(to = "paragraphs")
web_pages_txt_corpus_para
## Corpus consisting of 273 documents and 4 docvars.
## text1.1 :
## "Skip to content ggplot23.5.2 Get started Reference News Rele..."
## 
## text1.2 :
## "ggplot2 is a system for declaratively creating graphics, bas..."
## 
## text1.3 :
## "Installation"
## 
## text1.4 :
## "# The easiest way to get ggplot2 is to install the whole tid..."
## 
## text1.5 :
## "# Alternatively, install just ggplot2: install.packages("ggp..."
## 
## text1.6 :
## "# Or the development version from GitHub: # install.packages..."
## 
## [ reached max_ndoc ... 267 more documents ]
6.8.6.2.2 Summary
# Summarise `web_pages_txt_corpus`; the output below reports 9 documents.
# NOTE(review): this section reshapes the corpus into paragraphs
# (`web_pages_txt_corpus_para`) but summarises the un-reshaped corpus here.
# Confirm whether `summary(web_pages_txt_corpus_para)` was intended.
summary(web_pages_txt_corpus)
## Corpus consisting of 9 documents, showing 9 documents:
## 
##   Text Types Tokens Sentences      name version
##  text1   368    777        24   ggplot2   3.5.2
##  text2   419   1258        17     dplyr   1.1.4
##  text3   326    729        25     tidyr   1.3.1
##  text4   571   1745        47     readr   2.1.5
##  text5   248    495        11     purrr   1.1.0
##  text6   269    717        14    tibble   3.3.0
##  text7   398   1345        23   stringr   1.5.1
##  text8   264    648        14   forcats   1.0.0
##  text9   267    650        11 lubridate   1.9.4
##                                           CRAN
##    https://cloud.r-project.org/package=ggplot2
##      https://cloud.r-project.org/package=dplyr
##      https://cloud.r-project.org/package=tidyr
##      https://cloud.r-project.org/package=readr
##      https://cloud.r-project.org/package=purrr
##     https://cloud.r-project.org/package=tibble
##    https://cloud.r-project.org/package=stringr
##    https://cloud.r-project.org/package=forcats
##  https://cloud.r-project.org/package=lubridate
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##            http://r4ds.had.co.nz/iteration.html
##             https://r4ds.had.co.nz/tibbles.html
##              http://r4ds.hadley.nz/strings.html
##              http://r4ds.had.co.nz/factors.html
##           https://r4ds.hadley.nz/datetimes.html
6.8.6.2.3 Subset

We can keep only the paragraphs that contain 10 or more tokens:

# Keep only the paragraphs that contain at least 10 tokens.
web_pages_txt_corpus_para_long <- corpus_subset(
  web_pages_txt_corpus_para,
  subset = ntoken(web_pages_txt_corpus_para) >= 10
)
# Count the paragraphs that passed the filter.
ndoc(web_pages_txt_corpus_para_long)
## [1] 211
# Summarise the filtered corpus; the listing below shows 100 of its documents.
summary(web_pages_txt_corpus_para_long)
## Corpus consisting of 211 documents, showing 100 documents:
## 
##      Text Types Tokens Sentences    name version
##   text1.1    44     66         1 ggplot2   3.5.2
##   text1.2    35     44         2 ggplot2   3.5.2
##   text1.4    17     20         1 ggplot2   3.5.2
##   text1.5    11     13         1 ggplot2   3.5.2
##   text1.6    17     27         1 ggplot2   3.5.2
##   text1.8    51     84         3 ggplot2   3.5.2
##  text1.10    13     19         1 ggplot2   3.5.2
##  text1.12    50     75         3 ggplot2   3.5.2
##  text1.13    24     24         2 ggplot2   3.5.2
##  text1.15    31     37         2 ggplot2   3.5.2
##  text1.16    35     48         2 ggplot2   3.5.2
##  text1.17    20     20         1 ggplot2   3.5.2
##  text1.18    18     18         1 ggplot2   3.5.2
##  text1.19    31     36         2 ggplot2   3.5.2
##  text1.20    49     63         3 ggplot2   3.5.2
##  text1.22    11     11         1 ggplot2   3.5.2
##  text1.23    14     14         1 ggplot2   3.5.2
##  text1.24    29     35         2 ggplot2   3.5.2
##  text1.25    57     66         1 ggplot2   3.5.2
##  text1.27    27     36         1 ggplot2   3.5.2
##   text2.1    42     53         2   dplyr   1.1.4
##   text2.2    21     25         1   dplyr   1.1.4
##   text2.3    34     54         1   dplyr   1.1.4
##   text2.4    43     63         3   dplyr   1.1.4
##   text2.5    21     23         1   dplyr   1.1.4
##   text2.7    26     27         2   dplyr   1.1.4
##   text2.8    22     25         1   dplyr   1.1.4
##   text2.9    15     17         2   dplyr   1.1.4
##  text2.10    15     16         2   dplyr   1.1.4
##  text2.11    30     33         2   dplyr   1.1.4
##  text2.12    15     15         1   dplyr   1.1.4
##  text2.13    10     10         1   dplyr   1.1.4
##  text2.15    17     20         1   dplyr   1.1.4
##  text2.16    13     15         1   dplyr   1.1.4
##  text2.17    21     26         1   dplyr   1.1.4
##  text2.18    12     20         1   dplyr   1.1.4
##  text2.21    72    182         1   dplyr   1.1.4
##  text2.22    50     95         1   dplyr   1.1.4
##  text2.23    58    107         1   dplyr   1.1.4
##  text2.24    80    181         1   dplyr   1.1.4
##  text2.25    49    107         1   dplyr   1.1.4
##  text2.26    30     34         2   dplyr   1.1.4
##  text2.27    24     27         2   dplyr   1.1.4
##  text2.28    44     48         1   dplyr   1.1.4
##  text2.30    14     18         1   dplyr   1.1.4
##   text3.1    35     49         2   tidyr   1.3.1
##   text3.2    15     18         2   tidyr   1.3.1
##   text3.3    14     37         3   tidyr   1.3.1
##   text3.4    43     56         3   tidyr   1.3.1
##   text3.6    17     20         1   tidyr   1.3.1
##   text3.7    11     13         1   tidyr   1.3.1
##   text3.8    17     27         1   tidyr   1.3.1
##  text3.12    32     45         2   tidyr   1.3.1
##  text3.13    29     42         2   tidyr   1.3.1
##  text3.14    33     46         2   tidyr   1.3.1
##  text3.15    24     43         2   tidyr   1.3.1
##  text3.16    24     42         1   tidyr   1.3.1
##  text3.18    32     47         2   tidyr   1.3.1
##  text3.19    10     12         1   tidyr   1.3.1
##  text3.20    21     21         1   tidyr   1.3.1
##  text3.22    10     10         1   tidyr   1.3.1
##  text3.24    16     16         1   tidyr   1.3.1
##  text3.25    21     25         1   tidyr   1.3.1
##  text3.26    22     26         2   tidyr   1.3.1
##  text3.27    26     29         2   tidyr   1.3.1
##  text3.28    40     42         1   tidyr   1.3.1
##  text3.29     8     10         1   tidyr   1.3.1
##  text3.30    10     12         1   tidyr   1.3.1
##   text4.1    24     24         1   readr   2.1.5
##   text4.2    62     84         3   readr   2.1.5
##   text4.4    17     20         1   readr   2.1.5
##   text4.5    12     14         1   readr   2.1.5
##   text4.7    15     15         1   readr   2.1.5
##   text4.8    51    180         1   readr   2.1.5
##   text4.9    13     13         1   readr   2.1.5
##  text4.11    27     30         1   readr   2.1.5
##  text4.12    14     14         1   readr   2.1.5
##  text4.13    33     64         1   readr   2.1.5
##  text4.14    68    109         4   readr   2.1.5
##  text4.15    16     16         1   readr   2.1.5
##  text4.16    96    255         5   readr   2.1.5
##  text4.17    57     86         5   readr   2.1.5
##  text4.18    15     42         1   readr   2.1.5
##  text4.19    38     51         2   readr   2.1.5
##  text4.20    79    158         4   readr   2.1.5
##  text4.21    11     13         1   readr   2.1.5
##  text4.23    26     33         2   readr   2.1.5
##  text4.24    46     75         3   readr   2.1.5
##  text4.25    57     96         4   readr   2.1.5
##  text4.27    22     24         2   readr   2.1.5
##  text4.29     9     10         1   readr   2.1.5
##  text4.30    18     20         1   readr   2.1.5
##  text4.31    14     14         1   readr   2.1.5
##  text4.32    16     16         1   readr   2.1.5
##  text4.33    13     14         1   readr   2.1.5
##  text4.34    20     24         2   readr   2.1.5
##  text4.36    17     23         2   readr   2.1.5
##  text4.37    10     10         1   readr   2.1.5
##  text4.38    21     27         1   readr   2.1.5
##  text4.39    20     26         1   readr   2.1.5
##                                         CRAN
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##  https://cloud.r-project.org/package=ggplot2
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=dplyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=tidyr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##    https://cloud.r-project.org/package=readr
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html
##          http://r4ds.had.co.nz/data-import.html

6.8.6.3 Tokens

tokens() segments texts in a corpus into tokens (words or sentences) by word boundaries. We can choose whether to keep or remove punctuation.

6.8.6.3.1 With punctuation
# Tokenise the corpus with the default settings: punctuation marks are kept
# and show up as tokens of their own in the printed result.
web_pages_txt_corpus_tok <- web_pages_txt_corpus %>%
  tokens()
web_pages_txt_corpus_tok
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "to"           "content"      "ggplot23.5.2" "Get"         
##  [6] "started"      "Reference"    "News"         "Releases"     "Version"     
## [11] "3.5.0"        "Version"     
## [ ... and 765 more ]
## 
## text2 :
##  [1] "Skip"       "to"         "content"    "dplyr1.1.4" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Grouped"    "data"      
## [11] "Two-table"  "verbs"     
## [ ... and 1,246 more ]
## 
## text3 :
##  [1] "Skip"        "to"          "content"     "tidyr1.3.1"  "Tidy"       
##  [6] "data"        "Reference"   "Articles"    "Pivoting"    "Rectangling"
## [11] "Nested"      "data"       
## [ ... and 717 more ]
## 
## text4 :
##  [1] "Skip"       "to"         "content"    "readr2.1.5" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Column"     "type"      
## [11] "Locales"    "News"      
## [ ... and 1,733 more ]
## 
## text5 :
##  [1] "Skip"       "to"         "content"    "purrr1.1.0" "Reference" 
##  [6] "Articles"   "purrr"      "<"          "-"          ">"         
## [11] "base"       "R"         
## [ ... and 483 more ]
## 
## text6 :
##  [1] "Skip"        "to"          "content"     "tibble3.3.0" "Get"        
##  [6] "started"     "Reference"   "Articles"    "Column"      "types"      
## [11] "Controlling" "display"    
## [ ... and 705 more ]
## 
## [ reached max_ndoc ... 3 more documents ]
6.8.6.3.2 Without punctuation
# Tokenise the same corpus again, this time dropping punctuation tokens.
# Symbols such as "<" and ">" are not treated as punctuation, so they still
# appear in the printed result.
web_pages_txt_corpus_tok_no_punct <- web_pages_txt_corpus %>%
  tokens(remove_punct = TRUE)
web_pages_txt_corpus_tok_no_punct
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "to"           "content"      "ggplot23.5.2" "Get"         
##  [6] "started"      "Reference"    "News"         "Releases"     "Version"     
## [11] "3.5.0"        "Version"     
## [ ... and 635 more ]
## 
## text2 :
##  [1] "Skip"       "to"         "content"    "dplyr1.1.4" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Grouped"    "data"      
## [11] "Two-table"  "verbs"     
## [ ... and 988 more ]
## 
## text3 :
##  [1] "Skip"        "to"          "content"     "tidyr1.3.1"  "Tidy"       
##  [6] "data"        "Reference"   "Articles"    "Pivoting"    "Rectangling"
## [11] "Nested"      "data"       
## [ ... and 547 more ]
## 
## text4 :
##  [1] "Skip"       "to"         "content"    "readr2.1.5" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Column"     "type"      
## [11] "Locales"    "News"      
## [ ... and 1,364 more ]
## 
## text5 :
##  [1] "Skip"       "to"         "content"    "purrr1.1.0" "Reference" 
##  [6] "Articles"   "purrr"      "<"          ">"          "base"      
## [11] "R"          "Functional"
## [ ... and 376 more ]
## 
## text6 :
##  [1] "Skip"        "to"          "content"     "tibble3.3.0" "Get"        
##  [6] "started"     "Reference"   "Articles"    "Column"      "types"      
## [11] "Controlling" "display"    
## [ ... and 536 more ]
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.6.4 Types

We can also extract the types (i.e. the unique tokens) from the tokenised corpus (without punctuation).

# Collect the types (each distinct token listed once) found in the
# punctuation-free tokens object.
web_pages_txt_corpus_tok_no_punct_types <-
  web_pages_txt_corpus_tok_no_punct %>%
  types()
web_pages_txt_corpus_tok_no_punct_types
##    [1] "Skip"                                        
##    [2] "to"                                          
##    [3] "content"                                     
##    [4] "ggplot23.5.2"                                
##    [5] "Get"                                         
##    [6] "started"                                     
##    [7] "Reference"                                   
##    [8] "News"                                        
##    [9] "Releases"                                    
##   [10] "Version"                                     
##   [11] "3.5.0"                                       
##   [12] "3.4.0"                                       
##   [13] "3.3.0"                                       
##   [14] "3.2.0"                                       
##   [15] "3.1.0"                                       
##   [16] "3.0.0"                                       
##   [17] "2.2.0"                                       
##   [18] "2.1.0"                                       
##   [19] "2.0.0"                                       
##   [20] "1.0.0"                                       
##   [21] "Changelog"                                   
##   [22] "Articles"                                    
##   [23] "Aesthetic"                                   
##   [24] "specifications"                              
##   [25] "Developer"                                   
##   [26] "Extending"                                   
##   [27] "ggplot2"                                     
##   [28] "Using"                                       
##   [29] "in"                                          
##   [30] "packages"                                    
##   [31] "Profiling"                                   
##   [32] "Performance"                                 
##   [33] "FAQ"                                         
##   [34] "Axes"                                        
##   [35] "Faceting"                                    
##   [36] "Customising"                                 
##   [37] "Annotation"                                  
##   [38] "Reordering"                                  
##   [39] "Barplots"                                    
##   [40] "Ask"                                         
##   [41] "AI"                                          
##   [42] "✨"                                          
##   [43] "Overview"                                    
##   [44] "is"                                          
##   [45] "a"                                           
##   [46] "system"                                      
##   [47] "for"                                         
##   [48] "declaratively"                               
##   [49] "creating"                                    
##   [50] "graphics"                                    
##   [51] "based"                                       
##   [52] "on"                                          
##   [53] "The"                                         
##   [54] "Grammar"                                     
##   [55] "of"                                          
##   [56] "Graphics"                                    
##   [57] "You"                                         
##   [58] "provide"                                     
##   [59] "the"                                         
##   [60] "data"                                        
##   [61] "tell"                                        
##   [62] "how"                                         
##   [63] "map"                                         
##   [64] "variables"                                   
##   [65] "aesthetics"                                  
##   [66] "what"                                        
##   [67] "graphical"                                   
##   [68] "primitives"                                  
##   [69] "use"                                         
##   [70] "and"                                         
##   [71] "it"                                          
##   [72] "takes"                                       
##   [73] "care"                                        
##   [74] "details"                                     
##   [75] "Installation"                                
##   [76] "easiest"                                     
##   [77] "way"                                         
##   [78] "get"                                         
##   [79] "install"                                     
##   [80] "whole"                                       
##   [81] "tidyverse"                                   
##   [82] "install.packages"                            
##   [83] "Alternatively"                               
##   [84] "just"                                        
##   [85] "Or"                                          
##   [86] "development"                                 
##   [87] "version"                                     
##   [88] "from"                                        
##   [89] "GitHub"                                      
##   [90] "pak"                                         
##   [91] "Cheatsheet"                                  
##   [92] "Usage"                                       
##   [93] "It’s"                                        
##   [94] "hard"                                        
##   [95] "succinctly"                                  
##   [96] "describe"                                    
##   [97] "works"                                       
##   [98] "because"                                     
##   [99] "embodies"                                    
##  [100] "deep"                                        
##  [101] "philosophy"                                  
##  [102] "visualisation"                               
##  [103] "However"                                     
##  [104] "most"                                        
##  [105] "cases"                                       
##  [106] "you"                                         
##  [107] "start"                                       
##  [108] "with"                                        
##  [109] "ggplot"                                      
##  [110] "supply"                                      
##  [111] "dataset"                                     
##  [112] "aesthetic"                                   
##  [113] "mapping"                                     
##  [114] "aes"                                         
##  [115] "then"                                        
##  [116] "add"                                         
##  [117] "layers"                                      
##  [118] "like"                                        
##  [119] "geom_point"                                  
##  [120] "or"                                          
##  [121] "geom_histogram"                              
##  [122] "scales"                                      
##  [123] "scale_colour_brewer"                         
##  [124] "faceting"                                    
##  [125] "facet_wrap"                                  
##  [126] "coordinate"                                  
##  [127] "systems"                                     
##  [128] "coord_flip"                                  
##  [129] "library"                                     
##  [130] "mpg"                                         
##  [131] "displ"                                       
##  [132] "hwy"                                         
##  [133] "colour"                                      
##  [134] "="                                           
##  [135] "class"                                       
##  [136] "+"                                           
##  [137] "Lifecycle"                                   
##  [138] "now"                                         
##  [139] "over"                                        
##  [140] "10"                                          
##  [141] "years"                                       
##  [142] "old"                                         
##  [143] "used"                                        
##  [144] "by"                                          
##  [145] "hundreds"                                    
##  [146] "thousands"                                   
##  [147] "people"                                      
##  [148] "make"                                        
##  [149] "millions"                                    
##  [150] "plots"                                       
##  [151] "That"                                        
##  [152] "means"                                       
##  [153] "by-and-large"                                
##  [154] "itself"                                      
##  [155] "changes"                                     
##  [156] "relatively"                                  
##  [157] "little"                                      
##  [158] "When"                                        
##  [159] "we"                                          
##  [160] "do"                                          
##  [161] "they"                                        
##  [162] "will"                                        
##  [163] "be"                                          
##  [164] "generally"                                   
##  [165] "new"                                         
##  [166] "functions"                                   
##  [167] "arguments"                                   
##  [168] "rather"                                      
##  [169] "than"                                        
##  [170] "changing"                                    
##  [171] "behaviour"                                   
##  [172] "existing"                                    
##  [173] "if"                                          
##  [174] "them"                                        
##  [175] "compelling"                                  
##  [176] "reasons"                                     
##  [177] "If"                                          
##  [178] "are"                                         
##  [179] "looking"                                     
##  [180] "innovation"                                  
##  [181] "look"                                        
##  [182] "s"                                           
##  [183] "rich"                                        
##  [184] "ecosystem"                                   
##  [185] "extensions"                                  
##  [186] "See"                                         
##  [187] "community"                                   
##  [188] "maintained"                                  
##  [189] "list"                                        
##  [190] "at"                                          
##  [191] "https://exts.ggplot2.tidyverse.org/gallery/."
##  [192] "Learning"                                    
##  [193] "better"                                      
##  [194] "off"                                         
##  [195] "starting"                                    
##  [196] "systematic"                                  
##  [197] "introduction"                                
##  [198] "trying"                                      
##  [199] "learn"                                       
##  [200] "reading"                                     
##  [201] "individual"                                  
##  [202] "documentation"                               
##  [203] "pages"                                       
##  [204] "Currently"                                   
##  [205] "there"                                       
##  [206] "three"                                       
##  [207] "good"                                        
##  [208] "places"                                      
##  [209] "Data"                                        
##  [210] "Visualization"                               
##  [211] "Communication"                               
##  [212] "chapters"                                    
##  [213] "R"                                           
##  [214] "Science"                                     
##  [215] "designed"                                    
##  [216] "give"                                        
##  [217] "comprehensive"                               
##  [218] "these"                                       
##  [219] "two"                                         
##  [220] "up"                                          
##  [221] "speed"                                       
##  [222] "essentials"                                  
##  [223] "as"                                          
##  [224] "quickly"                                     
##  [225] "possible"                                    
##  [226] "you’d"                                       
##  [227] "take"                                        
##  [228] "an"                                          
##  [229] "online"                                      
##  [230] "course"                                      
##  [231] "try"                                         
##  [232] "With"                                        
##  [233] "Kara"                                        
##  [234] "Woo"                                         
##  [235] "follow"                                      
##  [236] "webinar"                                     
##  [237] "Plotting"                                    
##  [238] "Anything"                                    
##  [239] "Thomas"                                      
##  [240] "Lin"                                         
##  [241] "Pedersen"                                    
##  [242] "want"                                        
##  [243] "dive"                                        
##  [244] "into"                                        
##  [245] "making"                                      
##  [246] "common"                                      
##  [247] "I"                                           
##  [248] "recommend"                                   
##  [249] "Cookbook"                                    
##  [250] "Winston"                                     
##  [251] "Chang"                                       
##  [252] "It"                                          
##  [253] "provides"                                    
##  [254] "set"                                         
##  [255] "recipes"                                     
##  [256] "solve"                                       
##  [257] "problems"                                    
##  [258] "you’ve"                                      
##  [259] "mastered"                                    
##  [260] "basics"                                      
##  [261] "more"                                        
##  [262] "read"                                        
##  [263] "Elegant"                                     
##  [264] "Analysis"                                    
##  [265] "describes"                                   
##  [266] "theoretical"                                 
##  [267] "underpinnings"                               
##  [268] "shows"                                       
##  [269] "all"                                         
##  [270] "pieces"                                      
##  [271] "fit"                                         
##  [272] "together"                                    
##  [273] "This"                                        
##  [274] "book"                                        
##  [275] "helps"                                       
##  [276] "understand"                                  
##  [277] "theory"                                      
##  [278] "that"                                        
##  [279] "underpins"                                   
##  [280] "help"                                        
##  [281] "create"                                      
##  [282] "types"                                       
##  [283] "specifically"                                
##  [284] "tailored"                                    
##  [285] "your"                                        
##  [286] "needs"                                       
##  [287] "Getting"                                     
##  [288] "There"                                       
##  [289] "main"                                        
##  [290] "RStudio"                                     
##  [291] "friendly"                                    
##  [292] "place"                                       
##  [293] "ask"                                         
##  [294] "any"                                         
##  [295] "questions"                                   
##  [296] "about"                                       
##  [297] "Stack"                                       
##  [298] "Overflow"                                    
##  [299] "great"                                       
##  [300] "source"                                      
##  [301] "answers"                                     
##  [302] "also"                                        
##  [303] "once"                                        
##  [304] "have"                                        
##  [305] "created"                                     
##  [306] "reproducible"                                
##  [307] "example"                                     
##  [308] "illustrates"                                 
##  [309] "problem"                                     
##  [310] "Links"                                       
##  [311] "View"                                        
##  [312] "CRAN"                                        
##  [313] "Browse"                                      
##  [314] "code"                                        
##  [315] "Report"                                      
##  [316] "bug"                                         
##  [317] "Learn"                                       
##  [318] "Extensions"                                  
##  [319] "🧩"                                          
##  [320] "License"                                     
##  [321] "Full"                                        
##  [322] "license"                                     
##  [323] "MIT"                                         
##  [324] "file"                                        
##  [325] "LICENSE"                                     
##  [326] "Community"                                   
##  [327] "Contributing"                                
##  [328] "guide"                                       
##  [329] "Code"                                        
##  [330] "conduct"                                     
##  [331] "Citation"                                    
##  [332] "Citing"                                      
##  [333] "Developers"                                  
##  [334] "Hadley"                                      
##  [335] "Wickham"                                     
##  [336] "Author"                                      
##  [337] "Lionel"                                      
##  [338] "Henry"                                       
##  [339] "maintainer"                                  
##  [340] "Kohske"                                      
##  [341] "Takahashi"                                   
##  [342] "Claus"                                       
##  [343] "Wilke"                                       
##  [344] "Hiroaki"                                     
##  [345] "Yutani"                                      
##  [346] "Dewey"                                       
##  [347] "Dunnington"                                  
##  [348] "Teun"                                        
##  [349] "van"                                         
##  [350] "den"                                         
##  [351] "Brand"                                       
##  [352] "Copyright"                                   
##  [353] "holder"                                      
##  [354] "funder"                                      
##  [355] "Developed"                                   
##  [356] "Site"                                        
##  [357] "built"                                       
##  [358] "pkgdown"                                     
##  [359] "2.1.3"                                       
##  [360] "dplyr1.1.4"                                  
##  [361] "Grouped"                                     
##  [362] "Two-table"                                   
##  [363] "verbs"                                       
##  [364] "dplyr"                                       
##  [365] "<"                                           
##  [366] ">"                                           
##  [367] "base"                                        
##  [368] "Automation"                                  
##  [369] "Column-wise"                                 
##  [370] "operations"                                  
##  [371] "Row-wise"                                    
##  [372] "Programming"                                 
##  [373] "More"                                        
##  [374] "articles"                                    
##  [375] "1.1.0"                                       
##  [376] "0.8.3"                                       
##  [377] "0.8.2"                                       
##  [378] "0.8.1"                                       
##  [379] "0.8.0"                                       
##  [380] "0.7.5"                                       
##  [381] "grammar"                                     
##  [382] "manipulation"                                
##  [383] "providing"                                   
##  [384] "consistent"                                  
##  [385] "challenges"                                  
##  [386] "mutate"                                      
##  [387] "adds"                                        
##  [388] "select"                                      
##  [389] "picks"                                       
##  [390] "their"                                       
##  [391] "names"                                       
##  [392] "filter"                                      
##  [393] "values"                                      
##  [394] "summarise"                                   
##  [395] "reduces"                                     
##  [396] "multiple"                                    
##  [397] "down"                                        
##  [398] "single"                                      
##  [399] "summary"                                     
##  [400] "arrange"                                     
##  [401] "ordering"                                    
##  [402] "rows"                                        
##  [403] "These"                                       
##  [404] "combine"                                     
##  [405] "naturally"                                   
##  [406] "group_by"                                    
##  [407] "which"                                       
##  [408] "allows"                                      
##  [409] "perform"                                     
##  [410] "operation"                                   
##  [411] "group"                                       
##  [412] "can"                                         
##  [413] "vignette"                                    
##  [414] "As"                                          
##  [415] "well"                                        
##  [416] "single-table"                                
##  [417] "variety"                                     
##  [418] "two-table"                                   
##  [419] "best"                                        
##  [420] "transformation"                              
##  [421] "chapter"                                     
##  [422] "Backends"                                    
##  [423] "In"                                          
##  [424] "addition"                                    
##  [425] "frames"                                      
##  [426] "tibbles"                                     
##  [427] "makes"                                       
##  [428] "working"                                     
##  [429] "other"                                       
##  [430] "computational"                               
##  [431] "backends"                                    
##  [432] "accessible"                                  
##  [433] "efficient"                                   
##  [434] "Below"                                       
##  [435] "alternative"                                 
##  [436] "arrow"                                       
##  [437] "larger-than-memory"                          
##  [438] "datasets"                                    
##  [439] "including"                                   
##  [440] "remote"                                      
##  [441] "cloud"                                       
##  [442] "storage"                                     
##  [443] "AWS"                                         
##  [444] "S3"                                          
##  [445] "using"                                       
##  [446] "Apache"                                      
##  [447] "Arrow"                                       
##  [448] "C"                                           
##  [449] "engine"                                      
##  [450] "Acero"                                       
##  [451] "dtplyr"                                      
##  [452] "large"                                       
##  [453] "in-memory"                                   
##  [454] "Translates"                                  
##  [455] "high"                                        
##  [456] "performance"                                 
##  [457] "data.table"                                  
##  [458] "dbplyr"                                      
##  [459] "stored"                                      
##  [460] "relational"                                  
##  [461] "database"                                    
##  [462] "SQL"                                         
##  [463] "duckplyr"                                    
##  [464] "duckdb"                                      
##  [465] "zero"                                        
##  [466] "extra"                                       
##  [467] "copies"                                      
##  [468] "queries"                                     
##  [469] "automatic"                                   
##  [470] "fallback"                                    
##  [471] "when"                                        
##  [472] "translation"                                 
##  [473] "isn’t"                                       
##  [474] "still"                                       
##  [475] "small"                                       
##  [476] "enough"                                      
##  [477] "computer"                                    
##  [478] "sparklyr"                                    
##  [479] "very"                                        
##  [480] "Spark"                                       
##  [481] "Development"                                 
##  [482] "To"                                          
##  [483] "fix"                                         
##  [484] "feature"                                     
##  [485] "Cheat"                                       
##  [486] "Sheet"                                       
##  [487] "starwars"                                    
##  [488] "species"                                     
##  [489] "Droid"                                       
##  [490] "A"                                           
##  [491] "tibble"                                      
##  [492] "6"                                           
##  [493] "×"                                           
##  [494] "14"                                          
##  [495] "name"                                        
##  [496] "height"                                      
##  [497] "mass"                                        
##  [498] "hair_color"                                  
##  [499] "skin_color"                                  
##  [500] "eye_color"                                   
##  [501] "birth_year"                                  
##  [502] "sex"                                         
##  [503] "gender"                                      
##  [504] "chr"                                         
##  [505] "int"                                         
##  [506] "dbl"                                         
##  [507] "1"                                           
##  [508] "C-3PO"                                       
##  [509] "167"                                         
##  [510] "75"                                          
##  [511] "NA"                                          
##  [512] "gold"                                        
##  [513] "yellow"                                      
##  [514] "112"                                         
##  [515] "none"                                        
##  [516] "masculi"                                     
##  [517] "2"                                           
##  [518] "R2-D2"                                       
##  [519] "96"                                          
##  [520] "32"                                          
##  [521] "white"                                       
##  [522] "blue"                                        
##  [523] "red"                                         
##  [524] "33"                                          
##  [525] "3"                                           
##  [526] "R5-D4"                                       
##  [527] "97"                                          
##  [528] "4"                                           
##  [529] "IG-88"                                       
##  [530] "200"                                         
##  [531] "140"                                         
##  [532] "metal"                                       
##  [533] "15"                                          
##  [534] "5"                                           
##  [535] "R4-P17"                                      
##  [536] "silver"                                      
##  [537] "feminine"                                    
##  [538] "ℹ"                                           
##  [539] "row"                                         
##  [540] "homeworld"                                   
##  [541] "films"                                       
##  [542] "vehicles"                                    
##  [543] "starships"                                   
##  [544] "ends_with"                                   
##  [545] "color"                                       
##  [546] "87"                                          
##  [547] "Luke"                                        
##  [548] "Skywalker"                                   
##  [549] "blond"                                       
##  [550] "fair"                                        
##  [551] "Darth"                                       
##  [552] "Vader"                                       
##  [553] "Leia"                                        
##  [554] "Organa"                                      
##  [555] "brown"                                       
##  [556] "light"                                       
##  [557] "82"                                          
##  [558] "bmi"                                         
##  [559] "100"                                         
##  [560] "^"                                           
##  [561] "172"                                         
##  [562] "77"                                          
##  [563] "26.0"                                        
##  [564] "26.9"                                        
##  [565] "34.7"                                        
##  [566] "202"                                         
##  [567] "136"                                         
##  [568] "33.3"                                        
##  [569] "150"                                         
##  [570] "49"                                          
##  [571] "21.8"                                        
##  [572] "desc"                                        
##  [573] "Jabba"                                       
##  [574] "De"                                          
##  [575] "175"                                         
##  [576] "1358"                                        
##  [577] "green-tan"                                   
##  [578] "orange"                                      
##  [579] "600"                                         
##  [580] "herm"                                        
##  [581] "mascu"                                       
##  [582] "Grievous"                                    
##  [583] "216"                                         
##  [584] "159"                                         
##  [585] "wh"                                          
##  [586] "green"                                       
##  [587] "y"                                           
##  [588] "male"                                        
##  [589] "Va"                                          
##  [590] "41.9"                                        
##  [591] "Tarfful"                                     
##  [592] "234"                                         
##  [593] "n"                                           
##  [594] "mean"                                        
##  [595] "na.rm"                                       
##  [596] "TRUE"                                        
##  [597] "50"                                          
##  [598] "9"                                           
##  [599] "69.8"                                        
##  [600] "Gungan"                                      
##  [601] "74"                                          
##  [602] "Human"                                       
##  [603] "35"                                          
##  [604] "81.3"                                        
##  [605] "Kaminoan"                                    
##  [606] "88"                                          
##  [607] "Mirialan"                                    
##  [608] "53.1"                                        
##  [609] "encounter"                                   
##  [610] "clear"                                       
##  [611] "please"                                      
##  [612] "issue"                                       
##  [613] "minimal"                                     
##  [614] "For"                                         
##  [615] "discussion"                                  
##  [616] "community.rstudio.com"                       
##  [617] "manipulatr"                                  
##  [618] "mailing"                                     
##  [619] "Please"                                      
##  [620] "note"                                        
##  [621] "this"                                        
##  [622] "project"                                     
##  [623] "released"                                    
##  [624] "Contributor"                                 
##  [625] "Conduct"                                     
##  [626] "By"                                          
##  [627] "participating"                               
##  [628] "agree"                                       
##  [629] "abide"                                       
##  [630] "its"                                         
##  [631] "terms"                                       
##  [632] "Romain"                                      
##  [633] "François"                                    
##  [634] "Kirill"                                      
##  [635] "Müller"                                      
##  [636] "Davis"                                       
##  [637] "Vaughan"                                     
##  [638] "tidyr1.3.1"                                  
##  [639] "Tidy"                                        
##  [640] "Pivoting"                                    
##  [641] "Rectangling"                                 
##  [642] "Nested"                                      
##  [643] "1.3.0"                                       
##  [644] "1.2.0"                                       
##  [645] "0.7.0"                                       
##  [646] "0.6.0"                                       
##  [647] "0.5.0"                                       
##  [648] "0.4.0"                                       
##  [649] "0.3.0"                                       
##  [650] "0.2.0"                                       
##  [651] "0.1.0"                                       
##  [652] "tidyr"                                       
##  [653] "goal"                                        
##  [654] "tidy"                                        
##  [655] "where"                                       
##  [656] "Each"                                        
##  [657] "variable"                                    
##  [658] "column"                                      
##  [659] "each"                                        
##  [660] "observation"                                 
##  [661] "value"                                       
##  [662] "cell"                                        
##  [663] "standard"                                    
##  [664] "storing"                                     
##  [665] "wherever"                                    
##  [666] "throughout"                                  
##  [667] "ensure"                                      
##  [668] "you’ll"                                      
##  [669] "spend"                                       
##  [670] "less"                                        
##  [671] "time"                                        
##  [672] "fighting"                                    
##  [673] "tools"                                       
##  [674] "analysis"                                    
##  [675] "tidy-data"                                   
##  [676] "fall"                                        
##  [677] "five"                                        
##  [678] "categories"                                  
##  [679] "converts"                                    
##  [680] "between"                                     
##  [681] "long"                                        
##  [682] "wide"                                        
##  [683] "forms"                                       
##  [684] "introduces"                                  
##  [685] "pivot_longer"                                
##  [686] "pivot_wider"                                 
##  [687] "replacing"                                   
##  [688] "older"                                       
##  [689] "spread"                                      
##  [690] "gather"                                      
##  [691] "pivot"                                       
##  [692] "turns"                                       
##  [693] "deeply"                                      
##  [694] "nested"                                      
##  [695] "lists"                                       
##  [696] "JSON"                                        
##  [697] "unnest_longer"                               
##  [698] "unnest_wider"                                
##  [699] "hoist"                                       
##  [700] "rectangle"                                   
##  [701] "Nesting"                                     
##  [702] "grouped"                                     
##  [703] "form"                                        
##  [704] "becomes"                                     
##  [705] "containing"                                  
##  [706] "frame"                                       
##  [707] "unnesting"                                   
##  [708] "does"                                        
##  [709] "opposite"                                    
##  [710] "nest"                                        
##  [711] "unnest"                                      
##  [712] "Splitting"                                   
##  [713] "combining"                                   
##  [714] "character"                                   
##  [715] "columns"                                     
##  [716] "Use"                                         
##  [717] "separate_wider_delim"                        
##  [718] "separate_wider_position"                     
##  [719] "separate_wider_regex"                        
##  [720] "pull"                                        
##  [721] "unite"                                       
##  [722] "Make"                                        
##  [723] "implicit"                                    
##  [724] "missing"                                     
##  [725] "explicit"                                    
##  [726] "complete"                                    
##  [727] "drop_na"                                     
##  [728] "replace"                                     
##  [729] "next"                                        
##  [730] "previous"                                    
##  [731] "fill"                                        
##  [732] "known"                                       
##  [733] "replace_na"                                  
##  [734] "Related"                                     
##  [735] "work"                                        
##  [736] "supersedes"                                  
##  [737] "reshape2"                                    
##  [738] "2010-2014"                                   
##  [739] "reshape"                                     
##  [740] "2005-2010"                                   
##  [741] "Somewhat"                                    
##  [742] "counterintuitively"                          
##  [743] "iteration"                                   
##  [744] "package"                                     
##  [745] "has"                                         
##  [746] "done"                                        
##  [747] "tidying"                                     
##  [748] "not"                                         
##  [749] "general"                                     
##  [750] "reshaping"                                   
##  [751] "aggregation"                                 
##  [752] "high-performance"                            
##  [753] "implementations"                             
##  [754] "melt"                                        
##  [755] "dcast"                                       
##  [756] "CS"                                          
##  [757] "perspective"                                 
##  [758] "I’d"                                         
##  [759] "following"                                   
##  [760] "papers"                                      
##  [761] "Wrangler"                                    
##  [762] "Interactive"                                 
##  [763] "visual"                                      
##  [764] "specification"                               
##  [765] "scripts"                                     
##  [766] "An"                                          
##  [767] "interactive"                                 
##  [768] "framework"                                   
##  [769] "cleaning"                                    
##  [770] "Potter’s"                                    
##  [771] "wheel"                                       
##  [772] "On"                                          
##  [773] "efficiently"                                 
##  [774] "implementing"                                
##  [775] "SchemaSQL"                                   
##  [776] "here’s"                                      
##  [777] "terminology"                                 
##  [778] "different"                                   
##  [779] "longer"                                      
##  [780] "wider"                                       
##  [781] "cast"                                        
##  [782] "spreadsheets"                                
##  [783] "unpivot"                                     
##  [784] "databases"                                   
##  [785] "fold"                                        
##  [786] "unfold"                                      
##  [787] "github"                                      
##  [788] "contributing"                                
##  [789] "Maximilian"                                  
##  [790] "Girlich"                                     
##  [791] "authors"                                     
##  [792] "readr2.1.5"                                  
##  [793] "Column"                                      
##  [794] "type"                                        
##  [795] "Locales"                                     
##  [796] "2.1.0Version"                                
##  [797] "2.0.0Version"                                
##  [798] "1.4.0Version"                                
##  [799] "1.3.1Version"                                
##  [800] "1.0.0Version"                                
##  [801] "0.2.0Version"                                
##  [802] "readr"                                       
##  [803] "fast"                                        
##  [804] "rectangular"                                 
##  [805] "delimited"                                   
##  [806] "files"                                       
##  [807] "such"                                        
##  [808] "comma-separated"                             
##  [809] "CSV"                                         
##  [810] "tab-separated"                               
##  [811] "TSV"                                         
##  [812] "parse"                                       
##  [813] "many"                                        
##  [814] "found"                                       
##  [815] "wild"                                        
##  [816] "while"                                       
##  [817] "informative"                                 
##  [818] "report"                                      
##  [819] "parsing"                                     
##  [820] "leads"                                       
##  [821] "unexpected"                                  
##  [822] "results"                                     
##  [823] "import"                                      
##  [824] "part"                                        
##  [825] "core"                                        
##  [826] "so"                                          
##  [827] "load"                                        
##  [828] "─"                                           
##  [829] "Attaching"                                   
##  [830] "✔"                                           
##  [831] "1.1.4"                                       
##  [832] "2.1.4.9000"                                  
##  [833] "forcats"                                     
##  [834] "stringr"                                     
##  [835] "1.5.1"                                       
##  [836] "3.4.3"                                       
##  [837] "3.2.1"                                       
##  [838] "lubridate"                                   
##  [839] "1.9.3"                                       
##  [840] "purrr"                                       
##  [841] "1.0.2"                                       
##  [842] "Conflicts"                                   
##  [843] "tidyverse_conflicts"                         
##  [844] "✖"                                           
##  [845] "masks"                                       
##  [846] "stats"                                       
##  [847] "lag"                                         
##  [848] "conflicted"                                  
##  [849] "http://conflicted.r-lib.org/"                
##  [850] "force"                                       
##  [851] "conflicts"                                   
##  [852] "become"                                      
##  [853] "errors"                                      
##  [854] "Of"                                          
##  [855] "function"                                    
##  [856] "parses"                                      
##  [857] "lines"                                       
##  [858] "fields"                                      
##  [859] "supports"                                    
##  [860] "formats"                                     
##  [861] "read_"                                       
##  [862] "read_csv"                                    
##  [863] "read_tsv"                                    
##  [864] "read_csv2"                                   
##  [865] "semicolon-separated"                         
##  [866] "decimal"                                     
##  [867] "mark"                                        
##  [868] "read_delim"                                  
##  [869] "important"                                   
##  [870] "special"                                     
##  [871] "read_fwf"                                    
##  [872] "fixed-width"                                 
##  [873] "read_table"                                  
##  [874] "whitespace-separated"                        
##  [875] "read_log"                                    
##  [876] "web"                                         
##  [877] "log"                                         
##  [878] "should"                                      
##  [879] "converted"                                   
##  [880] "vector"                                      
##  [881] "specific"                                    
##  [882] "e.g"                                         
##  [883] "numeric"                                     
##  [884] "datetime"                                    
##  [885] "etc"                                         
##  [886] "absence"                                     
##  [887] "guess"                                       
##  [888] "column-types"                                
##  [889] "gives"                                       
##  [890] "detail"                                      
##  [891] "guesses"                                     
##  [892] "guessing"                                    
##  [893] "handy"                                       
##  [894] "especially"                                  
##  [895] "during"                                      
##  [896] "exploration"                                 
##  [897] "but"                                         
##  [898] "it’s"                                        
##  [899] "remember"                                    
##  [900] "matures"                                     
##  [901] "past"                                        
##  [902] "exploratory"                                 
##  [903] "phase"                                       
##  [904] "strategy"                                    
##  [905] "loads"                                       
##  [906] "sample"                                      
##  [907] "bundled"                                     
##  [908] "chickens"                                    
##  [909] "readr_example"                               
##  [910] "chickens.csv"                                
##  [911] "Rows"                                        
##  [912] "Columns"                                     
##  [913] "Delimiter"                                   
##  [914] "chicken"                                     
##  [915] "motto"                                       
##  [916] "eggs_laid"                                   
##  [917] "`"                                           
##  [918] "spec"                                        
##  [919] "retrieve"                                    
##  [920] "full"                                        
##  [921] "Specify"                                     
##  [922] "show_col_types"                              
##  [923] "FALSE"                                       
##  [924] "quiet"                                       
##  [925] "message"                                     
##  [926] "Foghorn"                                     
##  [927] "Leghorn"                                     
##  [928] "rooster"                                     
##  [929] "0"                                           
##  [930] "That's"                                      
##  [931] "joke"                                        
##  [932] "ah"                                          
##  [933] "say"                                         
##  [934] "that's"                                      
##  [935] "jok"                                         
##  [936] "Chicken"                                     
##  [937] "Little"                                      
##  [938] "hen"                                         
##  [939] "sky"                                         
##  [940] "falling"                                     
##  [941] "Ginger"                                      
##  [942] "12"                                          
##  [943] "Listen"                                      
##  [944] "We'll"                                       
##  [945] "either"                                      
##  [946] "die"                                         
##  [947] "free"                                        
##  [948] "chick"                                       
##  [949] "Camilla"                                     
##  [950] "7"                                           
##  [951] "Bawk"                                        
##  [952] "buck"                                        
##  [953] "ba-gawk"                                     
##  [954] "Ernie"                                       
##  [955] "Giant"                                       
##  [956] "Put"                                         
##  [957] "Captain"                                     
##  [958] "Solo"                                        
##  [959] "cargo"                                       
##  [960] "hold"                                        
##  [961] "Note"                                        
##  [962] "prints"                                      
##  [963] "guessed"                                     
##  [964] "case"                                        
##  [965] "useful"                                      
##  [966] "check"                                       
##  [967] "been"                                        
##  [968] "expect"                                      
##  [969] "haven’t"                                     
##  [970] "need"                                        
##  [971] "sounds"                                      
##  [972] "lot"                                         
##  [973] "trouble"                                     
##  [974] "luckily"                                     
##  [975] "affords"                                     
##  [976] "nice"                                        
##  [977] "workflow"                                    
##  [978] "initial"                                     
##  [979] "effort"                                      
##  [980] "cols"                                        
##  [981] "col_character"                               
##  [982] "col_double"                                  
##  [983] "Now"                                         
##  [984] "copy"                                        
##  [985] "paste"                                       
##  [986] "tweak"                                       
##  [987] "call"                                        
##  [988] "expresses"                                   
##  [989] "desired"                                     
##  [990] "Here"                                        
##  [991] "express"                                     
##  [992] "factor"                                      
##  [993] "levels"                                      
##  [994] "order"                                       
##  [995] "integer"                                     
##  [996] "col_types"                                   
##  [997] "col_factor"                                  
##  [998] "c"                                           
##  [999] "col_integer"                                 
## [1000] "fct"                                         
## [1001] "expanded"                                    
## [1002] "Editions"                                    
## [1003] "got"                                         
## [1004] "July"                                        
## [1005] "2021"                                        
## [1006] "so-called"                                   
## [1007] "second"                                      
## [1008] "edition"                                     
## [1009] "calls"                                       
## [1010] "vroom"                                       
## [1011] "default"                                     
## [1012] "versions"                                    
## [1013] "prior"                                       
## [1014] "called"                                      
## [1015] "first"                                       
## [1016] "you’re"                                      
## [1017] "access"                                      
## [1018] "via"                                         
## [1019] "with_edition"                                
## [1020] "local_edition"                               
## [1021] "And"                                         
## [1022] "obviously"                                   
## [1023] "definition"                                  
## [1024] "that’s"                                      
## [1025] "We"                                          
## [1026] "continue"                                    
## [1027] "support"                                     
## [1028] "number"                                      
## [1029] "releases"                                    
## [1030] "overall"                                     
## [1031] "uniformly"                                   
## [1032] "Therefore"                                   
## [1033] "plan"                                        
## [1034] "eventually"                                  
## [1035] "deprecate"                                   
## [1036] "remove"                                      
## [1037] "New"                                         
## [1038] "actively-maintained"                         
## [1039] "workarounds"                                 
## [1040] "offered"                                     
## [1041] "pragmatic"                                   
## [1042] "patch"                                       
## [1043] "legacy"                                      
## [1044] "temporary"                                   
## [1045] "solution"                                    
## [1046] "infelicities"                                
## [1047] "identified"                                  
## [1048] "Alternatives"                                
## [1049] "alternatives"                                
## [1050] "data.table’s"                                
## [1051] "fread"                                       
## [1052] "differences"                                 
## [1053] "discussed"                                   
## [1054] "below"                                       
## [1055] "Base"                                        
## [1056] "Compared"                                    
## [1057] "corresponding"                               
## [1058] "naming"                                      
## [1059] "scheme"                                      
## [1060] "parameters"                                  
## [1061] "col_names"                                   
## [1062] "header"                                      
## [1063] "colClasses"                                  
## [1064] "Are"                                         
## [1065] "much"                                        
## [1066] "faster"                                      
## [1067] "10x-100x"                                    
## [1068] "depending"                                   
## [1069] "Leave"                                       
## [1070] "strings"                                     
## [1071] "automatically"                               
## [1072] "date"                                        
## [1073] "Have"                                        
## [1074] "helpful"                                     
## [1075] "progress"                                    
## [1076] "bar"                                         
## [1077] "loading"                                     
## [1078] "going"                                       
## [1079] "All"                                         
## [1080] "exactly"                                     
## [1081] "same"                                        
## [1082] "regardless"                                  
## [1083] "current"                                     
## [1084] "locale"                                      
## [1085] "override"                                    
## [1086] "US-centric"                                  
## [1087] "defaults"                                    
## [1088] "similar"                                     
## [1089] "sometimes"                                   
## [1090] "slower"                                      
## [1091] "particularly"                                
## [1092] "heavy"                                       
## [1093] "Can"                                         
## [1094] "some"                                        
## [1095] "basically"                                   
## [1096] "encourage"                                   
## [1097] "delimiter"                                   
## [1098] "skipped"                                     
## [1099] "Follow"                                      
## [1100] "tidyverse-wide"                              
## [1101] "conventions"                                 
## [1102] "returning"                                   
## [1103] "approach"                                    
## [1104] "repair"                                      
## [1105] "mini-language"                               
## [1106] "selection"                                   
## [1107] "Acknowledgements"                            
## [1108] "Thanks"                                      
## [1109] "Joe"                                         
## [1110] "Cheng"                                       
## [1111] "showing"                                     
## [1112] "me"                                          
## [1113] "beauty"                                      
## [1114] "deterministic"                               
## [1115] "finite"                                      
## [1116] "automata"                                    
## [1117] "teaching"                                    
## [1118] "why"                                         
## [1119] "write"                                       
## [1120] "tokenizer"                                   
## [1121] "JJ"                                          
## [1122] "Allaire"                                     
## [1123] "helping"                                     
## [1124] "come"                                        
## [1125] "design"                                      
## [1126] "few"                                         
## [1127] "easy"                                        
## [1128] "extend"                                      
## [1129] "Dirk"                                        
## [1130] "Eddelbuettel"                                
## [1131] "coming"                                      
## [1132] "Jim"                                         
## [1133] "Hester"                                      
## [1134] "Jennifer"                                    
## [1135] "Bryan"                                       
## [1136] "2.0.7"                                       
## [1137] "purrr1.1.0"                                  
## [1138] "Functional"                                  
## [1139] "programming"                                 
## [1140] "languages"                                   
## [1141] "0.2.3"                                       
## [1142] "enhances"                                    
## [1143] "R’s"                                         
## [1144] "functional"                                  
## [1145] "FP"                                          
## [1146] "toolkit"                                     
## [1147] "vectors"                                     
## [1148] "never"                                       
## [1149] "heard"                                       
## [1150] "before"                                      
## [1151] "family"                                      
## [1152] "allow"                                       
## [1153] "loops"                                       
## [1154] "both"                                        
## [1155] "succinct"                                    
## [1156] "easier"                                      
## [1157] "uses"                                        
## [1158] "fairly"                                      
## [1159] "realistic"                                   
## [1160] "split"                                       
## [1161] "model"                                       
## [1162] "piece"                                       
## [1163] "compute"                                     
## [1164] "extract"                                     
## [1165] "R2"                                          
## [1166] "mtcars"                                      
## [1167] "|"                                           
## [1168] "$"                                           
## [1169] "cyl"                                         
## [1170] "df"                                          
## [1171] "lm"                                          
## [1172] "~"                                           
## [1173] "wt"                                          
## [1174] "map_dbl"                                     
## [1175] "r.squared"                                   
## [1176] "8"                                           
## [1177] "0.5086326"                                   
## [1178] "0.4645102"                                   
## [1179] "0.4229655"                                   
## [1180] "advantages"                                  
## [1181] "equivalents"                                 
## [1182] "argument"                                    
## [1183] "always"                                      
## [1184] "pipe"                                        
## [1185] "type-stable"                                 
## [1186] "They"                                        
## [1187] "return"                                      
## [1188] "advertised"                                  
## [1189] "output"                                      
## [1190] "returns"                                     
## [1191] "double"                                      
## [1192] "throw"                                       
## [1193] "error"                                       
## [1194] "accept"                                      
## [1195] "named"                                       
## [1196] "anonymous"                                   
## [1197] "lambda"                                      
## [1198] "components"                                  
## [1199] "position"                                    
## [1200] "obvious"                                     
## [1201] "easily"                                      
## [1202] "track"                                       
## [1203] "running"                                     
## [1204] "jobs"                                        
## [1205] "in_parallel"                                 
## [1206] "computation"                                 
## [1207] "across"                                      
## [1208] "cores"                                       
## [1209] "machines"                                    
## [1210] "network"                                     
## [1211] "tibble3.3.0"                                 
## [1212] "Controlling"                                 
## [1213] "display"                                     
## [1214] "numbers"                                     
## [1215] "Comparing"                                   
## [1216] "Invariants"                                  
## [1217] "behavior"                                    
## [1218] "2.1.1"                                       
## [1219] "2.0.1"                                       
## [1220] "pre-announcement"                            
## [1221] "1.4.2"                                       
## [1222] "1.4.1"                                       
## [1223] "tbl_df"                                      
## [1224] "modern"                                      
## [1225] "reimagining"                                 
## [1226] "data.frame"                                  
## [1227] "keeping"                                     
## [1228] "proven"                                      
## [1229] "effective"                                   
## [1230] "throwing"                                    
## [1231] "out"                                         
## [1232] "Tibbles"                                     
## [1233] "data.frames"                                 
## [1234] "lazy"                                        
## [1235] "surly"                                       
## [1236] "i.e"                                         
## [1237] "don’t"                                       
## [1238] "change"                                      
## [1239] "partial"                                     
## [1240] "matching"                                    
## [1241] "complain"                                    
## [1242] "exist"                                       
## [1243] "forces"                                      
## [1244] "confront"                                    
## [1245] "earlier"                                     
## [1246] "typically"                                   
## [1247] "leading"                                     
## [1248] "cleaner"                                     
## [1249] "expressive"                                  
## [1250] "enhanced"                                    
## [1251] "print"                                       
## [1252] "method"                                      
## [1253] "complex"                                     
## [1254] "objects"                                     
## [1255] "science"                                     
## [1256] "Create"                                      
## [1257] "object"                                      
## [1258] "as_tibble"                                   
## [1259] "b"                                           
## [1260] "letters"                                     
## [1261] "Sys.Date"                                    
## [1262] "2025-03-18"                                  
## [1263] "2025-03-17"                                  
## [1264] "2025-03-16"                                  
## [1265] "reasonable"                                  
## [1266] "inputs"                                      
## [1267] "already"                                     
## [1268] "matrices"                                    
## [1269] "tables"                                      
## [1270] "x"                                           
## [1271] "z"                                           
## [1272] "17"                                          
## [1273] "26"                                          
## [1274] "keeps"                                       
## [1275] "only"                                        
## [1276] "recycles"                                    
## [1277] "length"                                      
## [1278] "creates"                                     
## [1279] "row.names"                                   
## [1280] "features"                                    
## [1281] "define"                                      
## [1282] "row-by-row"                                  
## [1283] "tribble"                                     
## [1284] "3.6"                                         
## [1285] "8.5"                                         
## [1286] "draws"                                       
## [1287] "inspiration"                                 
## [1288] "Like"                                        
## [1289] "doesn’t"                                     
## [1290] "rownames"                                    
## [1291] "2.1.3.9000"                                  
## [1292] "stringr1.5.1"                                
## [1293] "From"                                        
## [1294] "Regular"                                     
## [1295] "expressions"                                 
## [1296] "1.5.0Version"                                
## [1297] "1.3.0Version"                                
## [1298] "1.2.0Version"                                
## [1299] "1.1.0Version"                                
## [1300] "Strings"                                     
## [1301] "glamorous"                                   
## [1302] "high-profile"                                
## [1303] "play"                                        
## [1304] "big"                                         
## [1305] "role"                                        
## [1306] "preparation"                                 
## [1307] "tasks"                                       
## [1308] "cohesive"                                    
## [1309] "familiar"                                    
## [1310] "top"                                         
## [1311] "stringi"                                     
## [1312] "ICU"                                         
## [1313] "correct"                                     
## [1314] "string"                                      
## [1315] "manipulations"                               
## [1316] "focusses"                                    
## [1317] "commonly"                                    
## [1318] "whereas"                                     
## [1319] "covering"                                    
## [1320] "almost"                                      
## [1321] "anything"                                    
## [1322] "imagine"                                     
## [1323] "find"                                        
## [1324] "Both"                                        
## [1325] "share"                                       
## [1326] "similarly"                                   
## [1327] "str_"                                        
## [1328] "video"                                       
## [1329] "cross"                                       
## [1330] "deal"                                        
## [1331] "authority"                                   
## [1332] "str_length"                                  
## [1333] "str_c"                                       
## [1334] "collapse"                                    
## [1335] "str_sub"                                     
## [1336] "vi"                                          
## [1337] "cr"                                          
## [1338] "ex"                                          
## [1339] "de"                                          
## [1340] "au"                                          
## [1341] "Most"                                        
## [1342] "regular"                                     
## [1343] "concise"                                     
## [1344] "language"                                    
## [1345] "describing"                                  
## [1346] "patterns"                                    
## [1347] "text"                                        
## [1348] "expression"                                  
## [1349] "aeiou"                                       
## [1350] "matches"                                     
## [1351] "vowel"                                       
## [1352] "str_subset"                                  
## [1353] "str_count"                                   
## [1354] "seven"                                       
## [1355] "str_detect"                                  
## [1356] "pattern"                                     
## [1357] "tells"                                       
## [1358] "there’s"                                     
## [1359] "match"                                       
## [1360] "counts"                                      
## [1361] "extracts"                                    
## [1362] "str_locate"                                  
## [1363] "end"                                         
## [1364] "str_extract"                                 
## [1365] "i"                                           
## [1366] "o"                                           
## [1367] "e"                                           
## [1368] "str_match"                                   
## [1369] "parts"                                       
## [1370] "defined"                                     
## [1371] "parentheses"                                 
## [1372] "characters"                                  
## [1373] "side"                                        
## [1374] "vid"                                         
## [1375] "v"                                           
## [1376] "d"                                           
## [1377] "ros"                                         
## [1378] "r"                                           
## [1379] "dea"                                         
## [1380] "aut"                                         
## [1381] "t"                                           
## [1382] "str_replace"                                 
## [1383] "replacement"                                 
## [1384] "replaces"                                    
## [1385] "deo"                                         
## [1386] "ss"                                          
## [1387] "xtra"                                        
## [1388] "al"                                          
## [1389] "uthority"                                    
## [1390] "str_split"                                   
## [1391] "splits"                                      
## [1392] "engines"                                     
## [1393] "fixed"                                       
## [1394] "exact"                                       
## [1395] "bytes"                                       
## [1396] "coll"                                        
## [1397] "human"                                       
## [1398] "boundary"                                    
## [1399] "boundaries"                                  
## [1400] "Addin"                                       
## [1401] "RegExplain"                                  
## [1402] "addin"                                       
## [1403] "interface"                                   
## [1404] "interactively"                               
## [1405] "build"                                       
## [1406] "regexp"                                      
## [1407] "consult"                                     
## [1408] "included"                                    
## [1409] "resources"                                   
## [1410] "installed"                                   
## [1411] "devtools"                                    
## [1412] "install_github"                              
## [1413] "gadenbuie"                                   
## [1414] "regexplain"                                  
## [1415] "solid"                                       
## [1416] "grown"                                       
## [1417] "organically"                                 
## [1418] "inconsistent"                                
## [1419] "Additionally"                                
## [1420] "behind"                                      
## [1421] "things"                                      
## [1422] "Ruby"                                        
## [1423] "Python"                                      
## [1424] "Uses"                                        
## [1425] "modify"                                      
## [1426] "conjunction"                                 
## [1427] "str_pad"                                     
## [1428] "right"                                       
## [1429] "11"                                          
## [1430] "f"                                           
## [1431] "g"                                           
## [1432] "h"                                           
## [1433] "j"                                           
## [1434] "k"                                           
## [1435] "Simplifies"                                  
## [1436] "eliminating"                                 
## [1437] "options"                                     
## [1438] "95"                                          
## [1439] "Produces"                                    
## [1440] "outputs"                                     
## [1441] "includes"                                    
## [1442] "ensuring"                                    
## [1443] "result"                                      
## [1444] "from-base"                                   
## [1445] "R4DS"                                        
## [1446] "copyright"                                   
## [1447] "forcats1.0.0"                                
## [1448] "0.5.0Version"                                
## [1449] "0.4.0Version"                                
## [1450] "0.3.0Version"                                
## [1451] "factors"                                     
## [1452] "handle"                                      
## [1453] "categorical"                                 
## [1454] "Factors"                                     
## [1455] "reordering"                                  
## [1456] "improve"                                     
## [1457] "suite"                                       
## [1458] "Some"                                        
## [1459] "examples"                                    
## [1460] "include"                                     
## [1461] "fct_reorder"                                 
## [1462] "another"                                     
## [1463] "fct_infreq"                                  
## [1464] "frequency"                                   
## [1465] "fct_relevel"                                 
## [1466] "Changing"                                    
## [1467] "hand"                                        
## [1468] "fct_lump"                                    
## [1469] "Collapsing"                                  
## [1470] "least"                                       
## [1471] "frequent"                                    
## [1472] "is.na"                                       
## [1473] "count"                                       
## [1474] "sort"                                        
## [1475] "37"                                          
## [1476] "Twi'lek"                                     
## [1477] "Wookiee"                                     
## [1478] "Zabrak"                                      
## [1479] "Aleena"                                      
## [1480] "Besalisk"                                    
## [1481] "27"                                          
## [1482] "Other"                                       
## [1483] "39"                                          
## [1484] "geom_bar"                                    
## [1485] "history"                                     
## [1486] "stringsAsFactors"                            
## [1487] "unauthorized"                                
## [1488] "biography"                                   
## [1489] "Roger"                                       
## [1490] "Peng"                                        
## [1491] "sigh"                                        
## [1492] "Lumley"                                      
## [1493] "approaches"                                  
## [1494] "Wrangling"                                   
## [1495] "Amelia"                                      
## [1496] "McNamara"                                    
## [1497] "Nicholas"                                    
## [1498] "Horton"                                      
## [1499] "Github"                                      
## [1500] "lubridate1.9.4"                              
## [1501] "1.7.0"                                       
## [1502] "1.6.0"                                       
## [1503] "Date-time"                                   
## [1504] "frustrating"                                 
## [1505] "commands"                                    
## [1506] "date-times"                                  
## [1507] "unintuitive"                                 
## [1508] "date-time"                                   
## [1509] "being"                                       
## [1510] "Moreover"                                    
## [1511] "methods"                                     
## [1512] "must"                                        
## [1513] "robust"                                      
## [1514] "zones"                                       
## [1515] "leap"                                        
## [1516] "days"                                        
## [1517] "daylight"                                    
## [1518] "savings"                                     
## [1519] "times"                                       
## [1520] "related"                                     
## [1521] "quirks"                                      
## [1522] "lacks"                                       
## [1523] "capabilities"                                
## [1524] "situations"                                  
## [1525] "Lubridate"                                   
## [1526] "Features"                                    
## [1527] "warn.conflicts"                              
## [1528] "Easy"                                        
## [1529] "ymd"                                         
## [1530] "ymd_hms"                                     
## [1531] "dmy"                                         
## [1532] "dmy_hms"                                     
## [1533] "mdy"                                         
## [1534] "20101215"                                    
## [1535] "2010-12-15"                                  
## [1536] "2017-04-01"                                  
## [1537] "Simple"                                      
## [1538] "year"                                        
## [1539] "month"                                       
## [1540] "mday"                                        
## [1541] "hour"                                        
## [1542] "minute"                                      
## [1543] "bday"                                        
## [1544] "1979"                                        
## [1545] "wday"                                        
## [1546] "label"                                       
## [1547] "Sun"                                         
## [1548] "Levels"                                      
## [1549] "Mon"                                         
## [1550] "Tue"                                         
## [1551] "Wed"                                         
## [1552] "Thu"                                         
## [1553] "Fri"                                         
## [1554] "Sat"                                         
## [1555] "2016"                                        
## [1556] "Helper"                                      
## [1557] "handling"                                    
## [1558] "with_tz"                                     
## [1559] "force_tz"                                    
## [1560] "2010-12-13"                                  
## [1561] "30"                                          
## [1562] "UTC"                                         
## [1563] "Changes"                                     
## [1564] "printing"                                    
## [1565] "America"                                     
## [1566] "Chicago"                                     
## [1567] "09"                                          
## [1568] "CST"                                         
## [1569] "expands"                                     
## [1570] "mathematical"                                
## [1571] "performed"                                   
## [1572] "span"                                        
## [1573] "classes"                                     
## [1574] "borrowed"                                    
## [1575] "https://www.joda.org"                        
## [1576] "durations"                                   
## [1577] "measure"                                     
## [1578] "amount"                                      
## [1579] "points"                                      
## [1580] "periods"                                     
## [1581] "accurately"                                  
## [1582] "clock"                                       
## [1583] "despite"                                     
## [1584] "seconds"                                     
## [1585] "day"                                         
## [1586] "intervals"                                   
## [1587] "protean"                                     
## [1588] "information"                                 
## [1589] "GPL"                                         
## [1590] "Vitalie"                                     
## [1591] "Spinu"                                       
## [1592] "Garrett"                                     
## [1593] "Grolemund"                                   
## [1594] "2.1.2"

6.8.6.5 Keyword-in-contexts (kwic)

6.8.6.5.1 Pattern

You can see how keywords are used in their actual contexts in the concordance view produced by kwic(). The pattern argument specifies the keyword we are interested in (with * as a wildcard). The window argument sets the number of words/tokens displayed on either side of each match.

# Concordance of every token matching the glob "tidy*", with a context
# window of 6 tokens on each side of the match.
web_pages_txt_corpus_tok_no_punct_types_tidy <-
  web_pages_txt_corpus_tok_no_punct %>%
  kwic(pattern = "tidy*", window = 6)
# Show only the first 10 matches.
web_pages_txt_corpus_tok_no_punct_types_tidy %>%
  head(n = 10)
## Keyword-in-context with 10 matches.                                                                            
##  [text1, 111]                 ggplot2 is to install the whole | tidyverse  |
##  [text1, 113] to install the whole tidyverse install.packages | tidyverse  |
##  [text1, 130]        from GitHub install.packages pak pak pak | tidyverse  |
##  [text1, 349]         you a comprehensive introduction to the | tidyverse  |
##  [text2, 318]                   dplyr is to install the whole | tidyverse  |
##  [text2, 320] to install the whole tidyverse install.packages | tidyverse  |
##  [text2, 357]        from GitHub install.packages pak pak pak | tidyverse  |
##    [text3, 4]                                 Skip to content | tidyr1.3.1 |
##    [text3, 5]                      Skip to content tidyr1.3.1 |    Tidy    |
##   [text3, 45]               Version 0.1.0 Changelog Ask AI ✨ |   tidyr    |
##                                                               
##  install.packages tidyverse Alternatively install just ggplot2
##  Alternatively install just ggplot2 install.packages ggplot2  
##  ggplot2 Cheatsheet Usage It’s hard to                        
##  and these two chapters will get                              
##  install.packages tidyverse Alternatively install just dplyr  
##  Alternatively install just dplyr install.packages dplyr      
##  dplyr Cheat Sheet Usage library dplyr                        
##  Tidy data Reference Articles Pivoting Rectangling            
##  data Reference Articles Pivoting Rectangling Nested          
##  Overview The goal of tidyr is
6.8.6.5.2 Phrase
# Concordance for a multi-word pattern: phrase() marks "the tidy*" as a
# sequence of tokens. Context window is 6 tokens on each side of the match.
web_pages_txt_corpus_tok_no_punct_phrase <-
  web_pages_txt_corpus_tok_no_punct %>%
  kwic(pattern = phrase("the tidy*"), window = 6)
# Show at most the first 10 matches.
web_pages_txt_corpus_tok_no_punct_phrase %>%
  head(n = 10)
## Keyword-in-context with 3 matches.                                                                     
##  [text1, 348:349]          give you a comprehensive introduction to |
##  [text3, 109:110]         that is used wherever possible throughout |
##  [text3, 477:478] please use community.rstudio.com Please note that |
##                                                        
##  the tidyverse | and these two chapters will get       
##  the tidyverse | If you ensure that your data          
##    the tidyr   | project is released with a Contributor

6.8.6.6 stopwords

Stopwords are function words (or grammatical words). We can search for these and remove them when they are not needed. This step is often useful because we are usually not interested in these stopwords.

tokens_remove() is an alias to tokens_select(selection = "remove")

# Drop English stopwords (list taken from the "stopwords-iso" source).
# tokens_remove() is the shortcut for tokens_select(selection = "remove").
web_pages_txt_corpus_tok_no_punct_no_Stop <-
  web_pages_txt_corpus_tok_no_punct %>%
  tokens_remove(pattern = stopwords("en", source = "stopwords-iso"))
web_pages_txt_corpus_tok_no_punct_no_Stop
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "content"      "ggplot23.5.2" "started"      "Reference"   
##  [6] "News"         "Releases"     "Version"      "3.5.0"        "Version"     
## [11] "3.4.0"        "Version"     
## [ ... and 339 more ]
## 
## text2 :
##  [1] "Skip"       "content"    "dplyr1.1.4" "started"    "Reference" 
##  [6] "Articles"   "data"       "Two-table"  "verbs"      "dplyr"     
## [11] "<"          ">"         
## [ ... and 733 more ]
## 
## text3 :
##  [1] "Skip"        "content"     "tidyr1.3.1"  "Tidy"        "data"       
##  [6] "Reference"   "Articles"    "Pivoting"    "Rectangling" "Nested"     
## [11] "data"        "articles"   
## [ ... and 322 more ]
## 
## text4 :
##  [1] "Skip"       "content"    "readr2.1.5" "started"    "Reference" 
##  [6] "Articles"   "Column"     "type"       "Locales"    "News"      
## [11] "Releases"   "Version"   
## [ ... and 884 more ]
## 
## text5 :
##  [1] "Skip"        "content"     "purrr1.1.0"  "Reference"   "Articles"   
##  [6] "purrr"       "<"           ">"           "base"        "Functional" 
## [11] "programming" "languages"  
## [ ... and 227 more ]
## 
## text6 :
##  [1] "Skip"        "content"     "tibble3.3.0" "started"     "Reference"  
##  [6] "Articles"    "Column"      "types"       "Controlling" "display"    
## [11] "Comparing"   "display"    
## [ ... and 338 more ]
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.6.7 wordstem

To extract the stem of each word, we use one of the functions tokens_wordstem(), char_wordstem() or dfm_wordstem().

6.8.6.7.1 tokens_wordstem
# Reduce every token in the stopword-free tokens object to its word stem.
web_pages_txt_corpus_tok_no_punct_no_Stop_stem <-
  web_pages_txt_corpus_tok_no_punct_no_Stop %>%
  tokens_wordstem()
web_pages_txt_corpus_tok_no_punct_no_Stop_stem
## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "content"      "ggplot23.5.2" "start"        "Refer"       
##  [6] "New"          "Releas"       "Version"      "3.5.0"        "Version"     
## [11] "3.4.0"        "Version"     
## [ ... and 339 more ]
## 
## text2 :
##  [1] "Skip"       "content"    "dplyr1.1.4" "start"      "Refer"     
##  [6] "Articl"     "data"       "Two-tabl"   "verb"       "dplyr"     
## [11] "<"          ">"         
## [ ... and 733 more ]
## 
## text3 :
##  [1] "Skip"       "content"    "tidyr1.3.1" "Tidi"       "data"      
##  [6] "Refer"      "Articl"     "Pivot"      "Rectangl"   "Nest"      
## [11] "data"       "articl"    
## [ ... and 322 more ]
## 
## text4 :
##  [1] "Skip"       "content"    "readr2.1.5" "start"      "Refer"     
##  [6] "Articl"     "Column"     "type"       "Local"      "New"       
## [11] "Releas"     "Version"   
## [ ... and 884 more ]
## 
## text5 :
##  [1] "Skip"       "content"    "purrr1.1.0" "Refer"      "Articl"    
##  [6] "purrr"      "<"          ">"          "base"       "Function"  
## [11] "program"    "languag"   
## [ ... and 227 more ]
## 
## text6 :
##  [1] "Skip"        "content"     "tibble3.3.0" "start"       "Refer"      
##  [6] "Articl"      "Column"      "type"        "Control"     "display"    
## [11] "Compar"      "display"    
## [ ... and 338 more ]
## 
## [ reached max_ndoc ... 3 more documents ]
6.8.6.7.2 dfm_wordstem

Here we build a dfm (Document-Feature Matrix) from the tokens and then apply dfm_wordstem() to see how often each word stem is used in each of the texts.

# Build the document-feature matrix from the stopword-free (not yet stemmed)
# tokens; the stemming is applied to the dfm features in the next step.
web_pages_txt_corpus_tok_no_punct_no_Stop_stem_dfm <-
  web_pages_txt_corpus_tok_no_punct_no_Stop %>%
  dfm()
# Stem the features of the dfm and print the result (it is not stored).
web_pages_txt_corpus_tok_no_punct_no_Stop_stem_dfm %>%
  dfm_wordstem()
## Document-feature matrix of: 9 documents, 1,029 features (79.88% sparse) and 4 docvars.
##        features
## docs    skip content ggplot23.5.2 start refer news releas version 3.5.0 3.4.0
##   text1    1       1            1     4     1    1      1      11     1     1
##   text2    1       1            0     2     1    1      2      10     0     0
##   text3    1       1            0     1     1    1      2      13     0     0
##   text4    2       1            0     2     1    1      3       3     0     0
##   text5    1       1            0     1     1    1      1       5     0     0
##   text6    1       1            0     2     1    1      1       7     0     0
## [ reached max_ndoc ... 3 more documents, reached max_nfeat ... 1,019 more features ]