6.8 Multiple webpages

6.8.1 Read_html

website <- "https://www.tidyverse.org/packages/" %>% 
  rvest::read_html()
website

## {html_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n<div id="appTidyverseSite" class="shrinkHeader alwaysShrinkHeader ...

a_elements <- website %>% 
  rvest::html_elements(css = "div.package > a")
a_elements

## {xml_nodeset (9)}
## [1] <a href="https://ggplot2.tidyverse.org/" target="_blank"><img class="pack ...
## [2] <a href="https://dplyr.tidyverse.org/" target="_blank"><img class="packag ...
## [3] <a href="https://tidyr.tidyverse.org/" target="_blank"><img class="packag ...
## [4] <a href="https://readr.tidyverse.org/" target="_blank"><img class="packag ...
## [5] <a href="https://purrr.tidyverse.org/" target="_blank"><img class="packag ...
## [6] <a href="https://tibble.tidyverse.org/" target="_blank"><img class="packa ...
## [7] <a href="https://stringr.tidyverse.org/" target="_blank"><img class="pack ...
## [8] <a href="https://forcats.tidyverse.org/" target="_blank"><img class="pack ...
## [9] <a href="https://lubridate.tidyverse.org/" target="_blank"><img class="pa ...

6.8.2 Extract headline

links <- a_elements %>%
  rvest::html_attr(name = "href")
links

## [1] "https://ggplot2.tidyverse.org/"   "https://dplyr.tidyverse.org/"    
## [3] "https://tidyr.tidyverse.org/"     "https://readr.tidyverse.org/"    
## [5] "https://purrr.tidyverse.org/"     "https://tibble.tidyverse.org/"   
## [7] "https://stringr.tidyverse.org/"   "https://forcats.tidyverse.org/"  
## [9] "https://lubridate.tidyverse.org/"

6.8.3 Extract subpages

pages <- links %>% 
  map(rvest::read_html)
pages

## [[1]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[2]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[3]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[4]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[5]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[6]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[7]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[8]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...
## 
## [[9]]
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <a href="#container" class="visually-hidden-focusable">Skip t ...

The structure seems to be similar across all pages

pages %>% 
  map(rvest::html_element, css = "a.navbar-brand") %>% 
  map_chr(rvest::html_text)

## [1] "ggplot2"   "dplyr"     "tidyr"     "readr"     "purrr"     "tibble"   
## [7] "stringr"   "forcats"   "lubridate"

and extracting version number

pages %>% 
  map(rvest::html_element, css = "small.nav-text.text-muted.me-auto") %>% 
  map_chr(rvest::html_text)

## [1] "4.0.1" "1.1.4" "1.3.1" "2.1.6" "1.2.0" "3.3.0" "1.6.0" "1.0.1" "1.9.4"

and we can also add all into a tibble

6.8.4 Extract text

pages_table <- tibble(
  name = pages %>% 
    map(rvest::html_element, css = "a.navbar-brand") %>% 
    map_chr(rvest::html_text),
  version = pages %>% 
    map(rvest::html_element, css = "small.nav-text.text-muted.me-auto") %>% 
    map_chr(rvest::html_text),
  CRAN = pages %>% 
    map(rvest::html_element, css = "ul.list-unstyled > li:nth-child(1) > a") %>% 
    map_chr(rvest::html_attr, name = "href"),
  Learn = pages %>% 
    map(rvest::html_element, css = "ul.list-unstyled > li:nth-child(4) > a") %>% 
    map_chr(rvest::html_attr, name = "href"), 
  text = pages %>%
    map(rvest::html_element,  css = "body") %>%
    map_chr(rvest::html_text2)
)
pages_table

## # A tibble: 9 × 5
##   name      version CRAN                                          Learn    text 
##   <chr>     <chr>   <chr>                                         <chr>    <chr>
## 1 ggplot2   4.0.1   https://cloud.r-project.org/package=ggplot2   https:/… "Ski…
## 2 dplyr     1.1.4   https://cloud.r-project.org/package=dplyr     http://… "Ski…
## 3 tidyr     1.3.1   https://cloud.r-project.org/package=tidyr     https:/… "Ski…
## 4 readr     2.1.6   https://cloud.r-project.org/package=readr     http://… "Ski…
## 5 purrr     1.2.0   https://cloud.r-project.org/package=purrr     http://… "Ski…
## 6 tibble    3.3.0   https://cloud.r-project.org/package=tibble    https:/… "Ski…
## 7 stringr   1.6.0   https://cloud.r-project.org/package=stringr   http://… "Ski…
## 8 forcats   1.0.1   https://cloud.r-project.org/package=forcats   http://… "Ski…
## 9 lubridate 1.9.4   https://cloud.r-project.org/package=lubridate https:/… "Ski…

6.8.5 Create a corpus

web_pages_txt_corpus <- corpus(pages_table)
print(web_pages_txt_corpus)

## Corpus consisting of 9 documents and 4 docvars.
## text1 :
## "Skip to content ggplot24.0.1 Get started Reference News Rele..."
## 
## text2 :
## "Skip to content dplyr1.1.4 Get started Reference Articles Gr..."
## 
## text3 :
## "Skip to content tidyr1.3.1 Tidy data Reference Articles Pivo..."
## 
## text4 :
## "Skip to content readr2.1.6 Get started Reference Articles Co..."
## 
## text5 :
## "Skip to content purrr1.2.0 Get started Reference Articles pu..."
## 
## text6 :
## "Skip to content tibble3.3.0 Get started Reference Articles C..."
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.5.1 Summary

summary(web_pages_txt_corpus, 10)

## Corpus consisting of 9 documents, showing 9 documents:
## 
##   Text Types Tokens Sentences      name version
##  text1   377    797        25   ggplot2   4.0.1
##  text2   419   1258        17     dplyr   1.1.4
##  text3   326    729        25     tidyr   1.3.1
##  text4   569   1751        47     readr   2.1.6
##  text5   251    499        11     purrr   1.2.0
##  text6   269    717        14    tibble   3.3.0
##  text7   401   1355        23   stringr   1.6.0
##  text8   264    645        14   forcats   1.0.1
##  text9   267    650        11 lubridate   1.9.4
##                                           CRAN
##    https://cloud.r-project.org/package=ggplot2
##      https://cloud.r-project.org/package=dplyr
##      https://cloud.r-project.org/package=tidyr
##      https://cloud.r-project.org/package=readr
##      https://cloud.r-project.org/package=purrr
##     https://cloud.r-project.org/package=tibble
##    https://cloud.r-project.org/package=stringr
##    https://cloud.r-project.org/package=forcats
##  https://cloud.r-project.org/package=lubridate
##                                           Learn
##  https://r4ds.had.co.nz/data-visualisation.html
##            http://r4ds.had.co.nz/transform.html
##                https://r4ds.hadley.nz/data-tidy
##          http://r4ds.had.co.nz/data-import.html
##            http://r4ds.had.co.nz/iteration.html
##             https://r4ds.had.co.nz/tibbles.html
##              http://r4ds.hadley.nz/strings.html
##              http://r4ds.had.co.nz/factors.html
##           https://r4ds.hadley.nz/datetimes.html

6.8.5.2 Accessing parts of corpus

web_pages_txt_corpus[[4]]

## [1] "Skip to content\nreadr2.1.6\nGet started\nReference\nArticles\nColumn type\nLocales\nNews\nReleases\nVersion 2.1.0\nVersion 2.0.0\nVersion 1.4.0\nVersion 1.3.1\nVersion 1.0.0\nVersion 0.2.0\nVersion 0.1.0\nChangelog\nreadr\nOverview\n\nThe goal of readr is to provide a fast and friendly way to read rectangular data from delimited files, such as comma-separated values (CSV) and tab-separated values (TSV). It is designed to parse many types of data found in the wild, while providing an informative problem report when parsing leads to unexpected results. If you are new to readr, the best place to start is the data import chapter in R for Data Science.\n\nInstallation\n\n# The easiest way to get readr is to install the whole tidyverse:\ninstall.packages(\"tidyverse\")\n\n# Alternatively, install just readr:\ninstall.packages(\"readr\")\nCheatsheet\n\nUsage\n\nreadr is part of the core tidyverse, so you can load it with:\n\n\nlibrary(tidyverse)\n#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n#> ✔ dplyr     1.1.4          ✔ readr     2.1.5.9000\n#> ✔ forcats   1.0.1          ✔ stringr   1.5.2     \n#> ✔ ggplot2   4.0.0          ✔ tibble    3.3.0     \n#> ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     \n#> ✔ purrr     1.1.0          \n#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n#> ✖ dplyr::filter() masks stats::filter()\n#> ✖ dplyr::lag()    masks stats::lag()\n#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors\n\nOf course, you can also load readr as an individual package:\n\n\nlibrary(readr)\n\nTo read a rectangular dataset with readr, you combine two pieces: a function that parses the lines of the file into individual fields and a column specification.\n\nreadr supports the following file formats with these read_*() functions:\n\nread_csv(): comma-separated values (CSV)\nread_tsv(): tab-separated values (TSV)\nread_csv2(): semicolon-separated values with , as the decimal mark\nread_delim(): delimited files (CSV and TSV are important special cases)\nread_fwf(): fixed-width files\nread_table(): whitespace-separated files\nread_log(): web log files\n\nA column specification describes how each column should be converted from a character vector to a specific data type (e.g. character, numeric, datetime, etc.). In the absence of a column specification, readr will guess column types from the data. vignette(\"column-types\") gives more detail on how readr guesses the column types. Column type guessing is very handy, especially during data exploration, but it’s important to remember these are just guesses. As any data analysis project matures past the exploratory phase, the best strategy is to provide explicit column types.\n\nThe following example loads a sample file bundled with readr and guesses the column types:\n\n\n(chickens <- read_csv(readr_example(\"chickens.csv\")))\n#> Rows: 5 Columns: 4\n#> ── Column specification ────────────────────────────────────────────────────────\n#> Delimiter: \",\"\n#> chr (3): chicken, sex, motto\n#> dbl (1): eggs_laid\n#> \n#> ℹ Use `spec()` to retrieve the full column specification for this data.\n#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.\n#> # A tibble: 5 × 4\n#>   chicken                 sex     eggs_laid motto                               \n#>   <chr>                   <chr>       <dbl> <chr>                               \n#> 1 Foghorn Leghorn         rooster         0 That's a joke, ah say, that's a jok…\n#> 2 Chicken Little          hen             3 The sky is falling!                 \n#> 3 Ginger                  hen            12 Listen. We'll either die free chick…\n#> 4 Camilla the Chicken     hen             7 Bawk, buck, ba-gawk.                \n#> 5 Ernie The Giant Chicken rooster         0 Put Captain Solo in the cargo hold.\n\nNote that readr prints the column types – the guessed column types, in this case. This is useful because it allows you to check that the columns have been read in as you expect. If they haven’t, that means you need to provide the column specification. This sounds like a lot of trouble, but luckily readr affords a nice workflow for this. Use spec() to retrieve the (guessed) column specification from your initial effort.\n\n\nspec(chickens)\n#> cols(\n#>   chicken = col_character(),\n#>   sex = col_character(),\n#>   eggs_laid = col_double(),\n#>   motto = col_character()\n#> )\n\nNow you can copy, paste, and tweak this, to create a more explicit readr call that expresses the desired column types. Here we express that sex should be a factor with levels rooster and hen, in that order, and that eggs_laid should be integer.\n\n\nchickens <- read_csv(\n  readr_example(\"chickens.csv\"),\n  col_types = cols(\n    chicken   = col_character(),\n    sex       = col_factor(levels = c(\"rooster\", \"hen\")),\n    eggs_laid = col_integer(),\n    motto     = col_character()\n  )\n)\nchickens\n#> # A tibble: 5 × 4\n#>   chicken                 sex     eggs_laid motto                               \n#>   <chr>                   <fct>       <int> <chr>                               \n#> 1 Foghorn Leghorn         rooster         0 That's a joke, ah say, that's a jok…\n#> 2 Chicken Little          hen             3 The sky is falling!                 \n#> 3 Ginger                  hen            12 Listen. We'll either die free chick…\n#> 4 Camilla the Chicken     hen             7 Bawk, buck, ba-gawk.                \n#> 5 Ernie The Giant Chicken rooster         0 Put Captain Solo in the cargo hold.\n\nvignette(\"readr\") gives an expanded introduction to readr.\n\nEditions\n\nreadr got a new parsing engine in version 2.0.0 (released July 2021). In this so-called second edition, readr calls vroom::vroom(), by default.\n\nThe parsing engine in readr versions prior to 2.0.0 is now called the first edition. If you’re using readr >= 2.0.0, you can still access first edition parsing via the functions with_edition(1, ...) and local_edition(1). And, obviously, if you’re using readr < 2.0.0, you will get first edition parsing, by definition, because that’s all there is.\n\nWe will continue to support the first edition for a number of releases, but the overall goal is to make the second edition uniformly better than the first. Therefore the plan is to eventually deprecate and then remove the first edition code. New code and actively-maintained code should use the second edition. The workarounds with_edition(1, ...) and local_edition(1) are offered as a pragmatic way to patch up legacy code or as a temporary solution for infelicities identified as the second edition matures.\n\nAlternatives\n\nThere are two main alternatives to readr: base R and data.table’s fread(). The most important differences are discussed below.\n\nBase R\n\nCompared to the corresponding base functions, readr functions:\n\nUse a consistent naming scheme for the parameters (e.g. col_names and col_types not header and colClasses).\n\nAre generally much faster (up to 10x-100x) depending on the dataset.\n\nLeave strings as is by default, and automatically parse common date/time formats.\n\nHave a helpful progress bar if loading is going to take a while.\n\nAll functions work exactly the same way regardless of the current locale. To override the US-centric defaults, use locale().\n\ndata.table and fread()\n\ndata.table has a function similar to read_csv() called fread(). Compared to fread(), readr functions:\n\nAre sometimes slower, particularly on numeric heavy data.\n\nCan automatically guess some parameters, but basically encourage explicit specification of, e.g., the delimiter, skipped rows, and the header row.\n\nFollow tidyverse-wide conventions, such as returning a tibble, a standard approach for column name repair, and a common mini-language for column selection.\n\nAcknowledgements\n\nThanks to:\n\nJoe Cheng for showing me the beauty of deterministic finite automata for parsing, and for teaching me why I should write a tokenizer.\n\nJJ Allaire for helping me come up with a design that makes very few copies, and is easy to extend.\n\nDirk Eddelbuettel for coming up with the name!\n\nLinks\nView on CRAN\nBrowse source code\nReport a bug\nLearn more\nLicense\nFull license\nMIT + file LICENSE\nCommunity\nContributing guide\nCode of conduct\nGetting help\nCitation\nCiting readr\nDevelopers\nHadley Wickham\nAuthor\nJim Hester\nAuthor\nJennifer Bryan\nAuthor, maintainer\n\nCopyright holder, funder\nMore about authors...\n\nDeveloped by Hadley Wickham, Jim Hester, Jennifer Bryan, .\n\nSite built with pkgdown 2.2.0."

6.8.5.3 Document-level information

head(docvars(web_pages_txt_corpus))

##      name version                                        CRAN
## 1 ggplot2   4.0.1 https://cloud.r-project.org/package=ggplot2
## 2   dplyr   1.1.4   https://cloud.r-project.org/package=dplyr
## 3   tidyr   1.3.1   https://cloud.r-project.org/package=tidyr
## 4   readr   2.1.6   https://cloud.r-project.org/package=readr
## 5   purrr   1.2.0   https://cloud.r-project.org/package=purrr
## 6  tibble   3.3.0  https://cloud.r-project.org/package=tibble
##                                            Learn
## 1 https://r4ds.had.co.nz/data-visualisation.html
## 2           http://r4ds.had.co.nz/transform.html
## 3               https://r4ds.hadley.nz/data-tidy
## 4         http://r4ds.had.co.nz/data-import.html
## 5           http://r4ds.had.co.nz/iteration.html
## 6            https://r4ds.had.co.nz/tibbles.html

6.8.6 Tokens

tokens() segments texts in a corpus into tokens (words or sentences) by word boundaries. We can remove punctuations or not

6.8.6.1 With punctuations

web_pages_txt_corpus_tok <- tokens(web_pages_txt_corpus)
web_pages_txt_corpus_tok

## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "to"           "content"      "ggplot24.0.1" "Get"         
##  [6] "started"      "Reference"    "News"         "Releases"     "Version"     
## [11] "4.0.0"        "Version"     
## [ ... and 785 more ]
## 
## text2 :
##  [1] "Skip"       "to"         "content"    "dplyr1.1.4" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Grouped"    "data"      
## [11] "Two-table"  "verbs"     
## [ ... and 1,246 more ]
## 
## text3 :
##  [1] "Skip"        "to"          "content"     "tidyr1.3.1"  "Tidy"       
##  [6] "data"        "Reference"   "Articles"    "Pivoting"    "Rectangling"
## [11] "Nested"      "data"       
## [ ... and 717 more ]
## 
## text4 :
##  [1] "Skip"       "to"         "content"    "readr2.1.6" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Column"     "type"      
## [11] "Locales"    "News"      
## [ ... and 1,739 more ]
## 
## text5 :
##  [1] "Skip"       "to"         "content"    "purrr1.2.0" "Get"       
##  [6] "started"    "Reference"  "Articles"   "purrr"      "<"         
## [11] "-"          ">"         
## [ ... and 487 more ]
## 
## text6 :
##  [1] "Skip"        "to"          "content"     "tibble3.3.0" "Get"        
##  [6] "started"     "Reference"   "Articles"    "Column"      "types"      
## [11] "Controlling" "display"    
## [ ... and 705 more ]
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.6.2 Without punctuations

web_pages_txt_corpus_tok_no_punct <- tokens(web_pages_txt_corpus, remove_punct = TRUE)
web_pages_txt_corpus_tok_no_punct

## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "to"           "content"      "ggplot24.0.1" "Get"         
##  [6] "started"      "Reference"    "News"         "Releases"     "Version"     
## [11] "4.0.0"        "Version"     
## [ ... and 652 more ]
## 
## text2 :
##  [1] "Skip"       "to"         "content"    "dplyr1.1.4" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Grouped"    "data"      
## [11] "Two-table"  "verbs"     
## [ ... and 988 more ]
## 
## text3 :
##  [1] "Skip"        "to"          "content"     "tidyr1.3.1"  "Tidy"       
##  [6] "data"        "Reference"   "Articles"    "Pivoting"    "Rectangling"
## [11] "Nested"      "data"       
## [ ... and 547 more ]
## 
## text4 :
##  [1] "Skip"       "to"         "content"    "readr2.1.6" "Get"       
##  [6] "started"    "Reference"  "Articles"   "Column"     "type"      
## [11] "Locales"    "News"      
## [ ... and 1,370 more ]
## 
## text5 :
##  [1] "Skip"       "to"         "content"    "purrr1.2.0" "Get"       
##  [6] "started"    "Reference"  "Articles"   "purrr"      "<"         
## [11] ">"          "base"      
## [ ... and 380 more ]
## 
## text6 :
##  [1] "Skip"        "to"          "content"     "tibble3.3.0" "Get"        
##  [6] "started"     "Reference"   "Articles"    "Column"      "types"      
## [11] "Controlling" "display"    
## [ ... and 536 more ]
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.7 Stop words

It is best to remove stop words (function/grammatical words) when we use statistical analyses of a corpus.

web_pages_txt_corpus_tok_no_punct_no_Stop <- tokens_select(web_pages_txt_corpus_tok_no_punct, pattern = stopwords("en", source = "stopwords-iso"), selection = "remove")
web_pages_txt_corpus_tok_no_punct_no_Stop

## Tokens consisting of 9 documents and 4 docvars.
## text1 :
##  [1] "Skip"         "content"      "ggplot24.0.1" "started"      "Reference"   
##  [6] "News"         "Releases"     "Version"      "4.0.0"        "Version"     
## [11] "3.5.0"        "Version"     
## [ ... and 349 more ]
## 
## text2 :
##  [1] "Skip"       "content"    "dplyr1.1.4" "started"    "Reference" 
##  [6] "Articles"   "data"       "Two-table"  "verbs"      "dplyr"     
## [11] "<"          ">"         
## [ ... and 733 more ]
## 
## text3 :
##  [1] "Skip"        "content"     "tidyr1.3.1"  "Tidy"        "data"       
##  [6] "Reference"   "Articles"    "Pivoting"    "Rectangling" "Nested"     
## [11] "data"        "articles"   
## [ ... and 322 more ]
## 
## text4 :
##  [1] "Skip"       "content"    "readr2.1.6" "started"    "Reference" 
##  [6] "Articles"   "Column"     "type"       "Locales"    "News"      
## [11] "Releases"   "Version"   
## [ ... and 890 more ]
## 
## text5 :
##  [1] "Skip"        "content"     "purrr1.2.0"  "started"     "Reference"  
##  [6] "Articles"    "purrr"       "<"           ">"           "base"       
## [11] "Functional"  "programming"
## [ ... and 230 more ]
## 
## text6 :
##  [1] "Skip"        "content"     "tibble3.3.0" "started"     "Reference"  
##  [6] "Articles"    "Column"      "types"       "Controlling" "display"    
## [11] "Comparing"   "display"    
## [ ... and 338 more ]
## 
## [ reached max_ndoc ... 3 more documents ]

6.8.8 Statistical analyses

We can start by providing statistics (whether descriptives or inferential) based on our corpora.

6.8.8.1 Simple frequency analysis

Here we look at obtaining a simple frequency analysis of usage.

6.8.8.1.1 DFM

We start by generating a DFM (document-feature matrix)

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm <- dfm(web_pages_txt_corpus_tok_no_punct_no_Stop)
web_pages_txt_corpus_tok_no_punct_no_Stop_dfm

## Document-feature matrix of: 9 documents, 1,176 features (80.94% sparse) and 4 docvars.
##        features
## docs    skip content ggplot24.0.1 started reference news releases version 4.0.0
##   text1    1       1            1       1         1    1        1      12     1
##   text2    1       1            0       1         1    1        1      10     0
##   text3    1       1            0       1         1    1        1      13     0
##   text4    1       1            0       1         1    1        2       8     1
##   text5    1       1            0       1         1    1        1       6     0
##   text6    1       1            0       1         1    1        1       7     0
##        features
## docs    3.5.0
##   text1     1
##   text2     0
##   text3     0
##   text4     0
##   text5     0
##   text6     0
## [ reached max_ndoc ... 3 more documents, reached max_nfeat ... 1,166 more features ]

6.8.8.1.2 Frequencies

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_freq <- textstat_frequency(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm)
web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_freq

##                                           feature frequency rank docfreq group
## 1                                               >       271    1       7   all
## 2                                               ─       132    2       1   all
## 3                                               <        91    3       8   all
## 4                                         version        74    4       9   all
## 5                                               1        72    5       6   all
## 6                                            data        55    6       9   all
## 7                                               2        47    7       7   all
## 8                                               3        39    8       5   all
## 9                                               =        38    9       8   all
## 10                                      tidyverse        34   10       9   all
## 11                                           code        32   11       9   all
## 12                                         tibble        32   11       4   all
## 13                                            chr        31   13       4   all
## 14                                         author        30   14       9   all
## 15                                      functions        29   15       7   all
## 16                                          readr        29   15       1   all
## 17                                        ggplot2        28   17       3   all
## 18                                         column        28   17       3   all
## 19                               install.packages        26   19       9   all
## 20                                        license        26   19       9   all
## 21                                              4        25   21       7   all
## 22                                              5        23   22       5   all
## 23                                          dplyr        22   23       3   all
## 24                                          learn        20   24       9   all
## 25                                        install        19   25       9   all
## 26                                            pak        18   26       6   all
## 27                                         hadley        18   26       9   all
## 28                                        wickham        18   26       9   all
## 29                                           time        18   26       5   all
## 30                                              +        16   30       8   all
## 31                                          tidyr        16   30       2   all
## 32                                        library        14   32       9   all
## 33                                           file        14   32       8   all
## 34                                         values        14   32       4   all
## 35                                        stringr        14   32       2   all
## 36                                          purrr        14   32       2   all
## 37                                            bug        13   37       9   all
## 38                                        conduct        13   37       9   all
## 39                                        species        13   37       2   all
## 40                                            dbl        13   37       3   all
## 41                                      community        12   41       9   all
## 42                                          types        12   41       3   all
## 43                                              ×        12   41       4   all
## 44                                        forcats        12   41       2   all
## 45                                      lubridate        12   41       2   all
## 46                                       articles        11   46       7   all
## 47                                          start        11   46       8   all
## 48                                       vignette        11   46       6   all
## 49                                              ℹ        11   46       3   all
## 50                                        chicken        11   46       1   all
## 51                                              |        11   46       2   all
## 52                                        started        10   52       9   all
## 53                                       releases        10   52       9   all
## 54                                          1.0.0        10   52       7   all
## 55                                         github        10   52       7   all
## 56                                           list        10   52       3   all
## 57                                         source        10   52       9   all
## 58                                         report        10   52       9   all
## 59                                   contributing        10   52       8   all
## 60                                          built        10   52       9   all
## 61                                           mass        10   52       1   all
## 62                                          aeiou        10   52       1   all
## 63                                        pattern        10   52       1   all
## 64                                           skip         9   64       9   all
## 65                                        content         9   64       9   all
## 66                                      reference         9   64       9   all
## 67                                           news         9   64       9   all
## 68                                      changelog         9   64       9   all
## 69                                       overview         9   64       9   all
## 70                                            map         9   64       2   all
## 71                                      variables         9   64       4   all
## 72                                   installation         9   64       9   all
## 73                                        easiest         9   64       9   all
## 74                                  alternatively         9   64       9   all
## 75                                    development         9   64       7   all
## 76                                        science         9   64       8   all
## 77                                         common         9   64       5   all
## 78                                            set         9   64       7   all
## 79                                          links         9   64       9   all
## 80                                           view         9   64       9   all
## 81                                           cran         9   64       9   all
## 82                                         browse         9   64       9   all
## 83                                          guide         9   64       8   all
## 84                                       citation         9   64       9   all
## 85                                         citing         9   64       9   all
## 86                                     developers         9   64       9   all
## 87                                     maintainer         9   64       9   all
## 88                                      copyright         9   64       8   all
## 89                                         holder         9   64       8   all
## 90                                      developed         9   64       9   all
## 91                                        pkgdown         9   64       9   all
## 92                                           base         9   64       4   all
## 93                                       starwars         9   64       2   all
## 94                                           true         9   64       4   all
## 95                                  specification         9   64       2   all
## 96                                              ✔         9   64       1   all
## 97                                        edition         9   64       1   all
## 98                                          2.0.0         8   98       3   all
## 99                                            mit         8   98       8   all
## 100                                        funder         8   98       8   all
## 101                                          rows         8   98       3   all
## 102                                             6         8   98       4   all
## 103                                           sex         8   98       2   all
## 104                                           red         8   98       1   all
## 105                                     character         8   98       5   all
## 106                                           hen         8   98       1   all
## 107                                        string         8   98       1   all
## 108                                            30         8   98       1   all
## 109                                           faq         7  109       1   all
## 110                                      graphics         7  109       1   all
## 111                                    cheatsheet         7  109       7   all
## 112                                          read         7  109       5   all
## 113                                        filter         7  109       3   all
## 114                                       chapter         7  109       7   all
## 115                                    data.table         7  109       4   all
## 116                                     eye_color         7  109       2   all
## 117                                       project         7  109       4   all
## 118                                          tidy         7  109       1   all
## 119                                          type         7  109       4   all
## 120                                       parsing         7  109       2   all
## 121                                       strings         7  109       2   all
## 122                                         match         7  109       1   all
## 123                                       factors         7  109       1   all
## 124                                       provide         6  124       4   all
## 125                                         usage         6  124       6   all
## 126                                        lionel         6  124       3   all
## 127                                         henry         6  124       3   all
## 128                                    operations         6  124       3   all
## 129                                      multiple         6  124       4   all
## 130                                        single         6  124       3   all
## 131                                       tibbles         6  124       3   all
## 132                                      datasets         6  124       2   all
## 133                                         2.1.3         6  124       6   all
## 134                                       columns         6  124       3   all
## 135                                       missing         6  124       2   all
## 136                                     eggs_laid         6  124       1   all
## 137                                       rooster         6  124       1   all
## 138                                             0         6  124       2   all
## 139                                        inputs         6  124       2   all
## 140                                       regular         6  124       1   all
## 141                                        create         5  141       4   all
## 142                                     questions         5  141       4   all
## 143                                         verbs         5  141       2   all
## 144                                         1.1.0         5  141       5   all
## 145                                         names         5  141       3   all
## 146                                         extra         5  141       2   all
## 147                                         white         5  141       1   all
## 148                                          blue         5  141       1   all
## 149                                            15         5  141       2   all
## 150                                           row         5  141       3   all
## 151                                         brown         5  141       1   all
## 152                                         mascu         5  141       1   all
## 153                                      variable         5  141       3   all
## 154                                      explicit         5  141       2   all
## 155                                       package         5  141       4   all
## 156                                         files         5  141       1   all
## 157                                           e.g         5  141       2   all
## 158                                         motto         5  141       1   all
## 159                                 col_character         5  141       1   all
## 160                                        factor         5  141       2   all
## 161                                        levels         5  141       3   all
## 162                                          easy         5  141       3   all
## 163                                       vectors         5  141       3   all
## 164                                      argument         5  141       2   all
## 165                                   expressions         5  141       1   all
## 166                                      devtools         5  141       2   all
## 167                                          bday         5  141       1   all
## 168                                    reordering         4  168       2   all
## 169                                       details         4  168       2   all
## 170                                        ggplot         4  168       2   all
## 171                                           aes         4  168       2   all
## 172                                      existing         4  168       3   all
## 173                                      designed         4  168       4   all
## 174                                        thomas         4  168       2   all
## 175                                     recommend         4  168       3   all
## 176                                         solve         4  168       4   all
## 177                                        pieces         4  168       4   all
## 178                                          main         4  168       4   all
## 179                                  reproducible         4  168       4   all
## 180                                   programming         4  168       3   all
## 181                                    consistent         4  168       4   all
## 182                                        mutate         4  168       2   all
## 183                                       summary         4  168       3   all
## 184                                         droid         4  168       2   all
## 185                                        height         4  168       1   all
## 186                                        yellow         4  168       1   all
## 187                                       masculi         4  168       1   all
## 188                                         human         4  168       3   all
## 189                                          note         4  168       4   all
## 190                                      released         4  168       4   all
## 191                                        kirill         4  168       2   all
## 192                                        müller         4  168       2   all
## 193                                         davis         4  168       2   all
## 194                                       vaughan         4  168       2   all
## 195                                         1.2.0         4  168       4   all
## 196                                          goal         4  168       3   all
## 197                                         pivot         4  168       1   all
## 198                                       authors         4  168       4   all
## 199                                      function         4  168       2   all
## 200                                      read_csv         4  168       1   all
## 201                                        vector         4  168       3   all
## 202                                      chickens         4  168       1   all
## 203                                             `         4  168       1   all
## 204                                        you’re         4  168       3   all
## 205                                         fread         4  168       1   all
## 206                                       extract         4  168       2   all
## 207                                             ~         4  168       2   all
## 208                                    components         4  168       3   all
## 209                                        easily         4  168       2   all
## 210                                      matching         4  168       2   all
## 211                                       letters         4  168       2   all
## 212                                       stringi         4  168       1   all
## 213                                         video         4  168       1   all
## 214                                         cross         4  168       1   all
## 215                                          deal         4  168       1   all
## 216                                     authority         4  168       1   all
## 217                                         addin         4  168       1   all
## 218                                     date-time         4  168       1   all
## 219                                    date-times         4  168       1   all
## 220                                    2010-12-13         4  168       1   all
## 221                                         2.2.0         3  221       2   all
## 222                                      packages         3  221       3   all
## 223                                   performance         3  221       2   all
## 224                                            ✨         3  221       3   all
## 225                                         based         3  221       2   all
## 226                                          hard         3  221       2   all
## 227                                       dataset         3  221       2   all
## 228                                    coord_flip         3  221       2   all
## 229                                      changing         3  221       2   all
## 230                                  introduction         3  221       2   all
## 231                                    individual         3  221       2   all
## 232                                         you’d         3  221       2   all
## 233                                          kara         3  221       1   all
## 234                                           woo         3  221       1   all
## 235                                           lin         3  221       1   all
## 236                                      pedersen         3  221       1   all
## 237                                       winston         3  221       1   all
## 238                                         chang         3  221       1   all
## 239                                        you’ve         3  221       3   all
## 240                                      analysis         3  221       3   all
## 241                                     describes         3  221       3   all
## 242                                           fit         3  221       3   all
## 243                                       rstudio         3  221       2   all
## 244                                      friendly         3  221       3   all
## 245                                     two-table         3  221       1   all
## 246                                  manipulation         3  221       2   all
## 247                                     providing         3  221       3   all
## 248                                        select         3  221       1   all
## 249                                       combine         3  221       3   all
## 250                                      backends         3  221       1   all
## 251                                        frames         3  221       2   all
## 252                                        engine         3  221       2   all
## 253                                    translates         3  221       1   all
## 254                                        duckdb         3  221       1   all
## 255                                            14         3  221       2   all
## 256                                    hair_color         3  221       1   all
## 257                                    skin_color         3  221       1   all
## 258                                         c-3po         3  221       1   all
## 259                                         r2-d2         3  221       1   all
## 260                                            96         3  221       1   all
## 261                                            32         3  221       1   all
## 262                                            87         3  221       1   all
## 263                                         darth         3  221       1   all
## 264                                            82         3  221       1   all
## 265                                           bmi         3  221       1   all
## 266                                           136         3  221       1   all
## 267                                          male         3  221       1   all
## 268                                             9         3  221       3   all
## 269                                        gungan         3  221       2   all
## 270                                            35         3  221       2   all
## 271                                     encounter         3  221       3   all
## 272                                       minimal         3  221       3   all
## 273                                    discussion         3  221       3   all
## 274                                   contributor         3  221       3   all
## 275                                         agree         3  221       3   all
## 276                                         abide         3  221       3   all
## 277                                         terms         3  221       3   all
## 278                                        nested         3  221       1   all
## 279                                         0.3.0         3  221       3   all
## 280                                         0.2.0         3  221       3   all
## 281                                         0.1.0         3  221       3   all
## 282                                         tools         3  221       3   all
## 283                                        spread         3  221       2   all
## 284                                         lists         3  221       3   all
## 285                                         frame         3  221       3   all
## 286                                       reshape         3  221       1   all
## 287                                   interactive         3  221       2   all
## 288                                          fast         3  221       3   all
## 289                                           csv         3  221       1   all
## 290                                           tsv         3  221       1   all
## 291                                          core         3  221       2   all
## 292                                          load         3  221       2   all
## 293                                           lag         3  221       2   all
## 294                                       formats         3  221       2   all
## 295                                       numeric         3  221       2   all
## 296                                       guesses         3  221       1   all
## 297                                          spec         3  221       1   all
## 298                                         false         3  221       3   all
## 299                                             7         3  221       2   all
## 300                                       default         3  221       2   all
## 301                                      compared         3  221       2   all
## 302                                      progress         3  221       2   all
## 303                                        locale         3  221       2   all
## 304                                     languages         3  221       2   all
## 305                                        easier         3  221       3   all
## 306                                       display         3  221       2   all
## 307                                    data.frame         3  221       1   all
## 308                                         don’t         3  221       2   all
## 309                                        change         3  221       2   all
## 310                                      patterns         3  221       1   all
## 311                                    str_subset         3  221       1   all
## 312                                     str_count         3  221       1   all
## 313                                      extracts         3  221       1   all
## 314                                       outputs         3  221       1   all
## 315                                   categorical         3  221       1   all
## 316                                          leap         3  221       1   all
## 317                                         times         3  221       1   all
## 318                                           sun         3  221       1   all
## 319                                           fri         3  221       1   all
## 320                                         4.0.0         2  320       2   all
## 321                                         3.3.0         2  320       2   all
## 322                                         3.0.0         2  320       2   all
## 323                                         2.1.0         2  320       2   all
## 324                                     aesthetic         2  320       1   all
## 325                                specifications         2  320       1   all
## 326                                     developer         2  320       2   all
## 327                                     extending         2  320       2   all
## 328                                      faceting         2  320       1   all
## 329                                       grammar         2  320       2   all
## 330                                          it’s         2  320       2   all
## 331                                           add         2  320       1   all
## 332                                    geom_point         2  320       1   all
## 333                                           mpg         2  320       2   all
## 334                                     behaviour         2  320       1   all
## 335                                    extensions         2  320       1   all
## 336                                       reading         2  320       2   all
## 337                                 visualization         2  320       1   all
## 338                                      chapters         2  320       1   all
## 339                                 comprehensive         2  320       2   all
## 340                                        follow         2  320       2   all
## 341                                      mastered         2  320       2   all
## 342                                   illustrates         2  320       2   all
## 343                                        kohske         2  320       1   all
## 344                                     takahashi         2  320       1   all
## 345                                         claus         2  320       1   all
## 346                                         wilke         2  320       1   all
## 347                                       hiroaki         2  320       1   all
## 348                                        yutani         2  320       1   all
## 349                                         dewey         2  320       1   all
## 350                                    dunnington         2  320       1   all
## 351                                          teun         2  320       1   all
## 352                                           van         2  320       1   all
## 353                                           den         2  320       1   all
## 354                                         brand         2  320       1   all
## 355                                         0.8.0         2  320       2   all
## 356                                         picks         2  320       1   all
## 357                                     summarise         2  320       1   all
## 358                                       arrange         2  320       1   all
## 359                                     naturally         2  320       2   all
## 360                                      group_by         2  320       1   all
## 361                                transformation         2  320       2   all
## 362                                         arrow         2  320       1   all
## 363                                     including         2  320       2   all
## 364                                        apache         2  320       1   all
## 365                                     in-memory         2  320       1   all
## 366                                        stored         2  320       1   all
## 367                                      database         2  320       2   all
## 368                                           sql         2  320       2   all
## 369                                        copies         2  320       2   all
## 370                                   translation         2  320       2   all
## 371                                    birth_year         2  320       1   all
## 372                                        gender         2  320       1   all
## 373                                           167         2  320       1   all
## 374                                            75         2  320       1   all
## 375                                          gold         2  320       1   all
## 376                                         ig-88         2  320       1   all
## 377                                           200         2  320       1   all
## 378                                           140         2  320       1   all
## 379                                         metal         2  320       1   all
## 380                                     homeworld         2  320       1   all
## 381                                         films         2  320       1   all
## 382                                      vehicles         2  320       1   all
## 383                                     starships         2  320       1   all
## 384                                          luke         2  320       1   all
## 385                                     skywalker         2  320       1   all
## 386                                         vader         2  320       1   all
## 387                                          leia         2  320       1   all
## 388                                        organa         2  320       1   all
## 389                                         light         2  320       2   all
## 390                                             ^         2  320       2   all
## 391                                           202         2  320       1   all
## 392                                            wh         2  320       2   all
## 393                                      kaminoan         2  320       2   all
## 394                                      mirialan         2  320       2   all
## 395                         community.rstudio.com         2  320       2   all
## 396                                        romain         2  320       1   all
## 397                                      françois         2  320       1   all
## 398                                      pivoting         2  320       1   all
## 399                                   rectangling         2  320       1   all
## 400                                         1.3.0         2  320       2   all
## 401                                         0.5.0         2  320       2   all
## 402                                         0.4.0         2  320       2   all
## 403                                   observation         2  320       1   all
## 404                                          cell         2  320       1   all
## 405                                      standard         2  320       2   all
## 406                                      converts         2  320       1   all
## 407                                    introduces         2  320       2   all
## 408                                        gather         2  320       1   all
## 409                                          nest         2  320       1   all
## 410                                      implicit         2  320       1   all
## 411                                      complete         2  320       2   all
## 412                                       replace         2  320       2   all
## 413                                      reshape2         2  320       1   all
## 414                                     iteration         2  320       2   all
## 415                                     reshaping         2  320       1   all
## 416                               implementations         2  320       2   all
## 417                                          melt         2  320       1   all
## 418                                      cleaning         2  320       2   all
## 419                                    maximilian         2  320       1   all
## 420                                       girlich         2  320       1   all
## 421                                         1.4.0         2  320       2   all
## 422                                         1.3.1         2  320       1   all
## 423                                   rectangular         2  320       1   all
## 424                                     delimited         2  320       1   all
## 425                               comma-separated         2  320       1   all
## 426                                 tab-separated         2  320       1   all
## 427                                         parse         2  320       1   all
## 428                                     conflicts         2  320       1   all
## 429                                             ✖         2  320       1   all
## 430                                         masks         2  320       1   all
## 431                                         stats         2  320       1   all
## 432                                         guess         2  320       1   all
## 433                                       matures         2  320       1   all
## 434                                 readr_example         2  320       1   all
## 435                                  chickens.csv         2  320       1   all
## 436                                     delimiter         2  320       1   all
## 437                                      retrieve         2  320       1   all
## 438                                       foghorn         2  320       1   all
## 439                                       leghorn         2  320       1   all
## 440                                          joke         2  320       1   all
## 441                                           jok         2  320       1   all
## 442                                           sky         2  320       1   all
## 443                                       falling         2  320       1   all
## 444                                        ginger         2  320       1   all
## 445                                            12         2  320       1   all
## 446                                        listen         2  320       1   all
## 447                                           die         2  320       1   all
## 448                                         chick         2  320       1   all
## 449                                       camilla         2  320       1   all
## 450                                          bawk         2  320       1   all
## 451                                          buck         2  320       1   all
## 452                                       ba-gawk         2  320       1   all
## 453                                         ernie         2  320       1   all
## 454                                         giant         2  320       1   all
## 455                                       captain         2  320       1   all
## 456                                          solo         2  320       1   all
## 457                                         cargo         2  320       1   all
## 458                                          hold         2  320       1   all
## 459                                       guessed         2  320       1   all
## 460                                         check         2  320       2   all
## 461                                          cols         2  320       1   all
## 462                                     col_types         2  320       1   all
## 463                                           fct         2  320       2   all
## 464                                         vroom         2  320       1   all
## 465                                        called         2  320       1   all
## 466                                  with_edition         2  320       1   all
## 467                                 local_edition         2  320       1   all
## 468                                  alternatives         2  320       1   all
## 469                                    parameters         2  320       1   all
## 470                                        header         2  320       1   all
## 471                                     depending         2  320       2   all
## 472                                 automatically         2  320       1   all
## 473                                       helpful         2  320       2   all
## 474                                   conventions         2  320       2   all
## 475                                           jim         2  320       1   all
## 476                                        hester         2  320       1   all
## 477                                      jennifer         2  320       1   all
## 478                                         bryan         2  320       1   all
## 479                                    functional         2  320       1   all
## 480                                            fp         2  320       1   all
## 481                                         split         2  320       1   all
## 482                                        mtcars         2  320       1   all
## 483                                            df         2  320       1   all
## 484                                       map_dbl         2  320       1   all
## 485                                             8         2  320       2   all
## 486                                    advantages         2  320       1   all
## 487                                          pipe         2  320       2   all
## 488                                        output         2  320       2   all
## 489                                       returns         2  320       1   all
## 490                                      position         2  320       2   all
## 491                                         track         2  320       2   all
## 492                                     comparing         2  320       1   all
## 493                                   data.frames         2  320       1   all
## 494                                         print         2  320       1   all
## 495                                        method         2  320       1   all
## 496                                       objects         2  320       2   all
## 497                                        object         2  320       2   all
## 498                                     as_tibble         2  320       1   all
## 499                                    2025-03-18         2  320       1   all
## 500                                    2025-03-17         2  320       1   all
## 501                                    2025-03-16         2  320       1   all
## 502                                            17         2  320       2   all
## 503                                      features         2  320       2   all
## 504                                       tribble         2  320       1   all
## 505                                           3.6         2  320       1   all
## 506                                           8.5         2  320       1   all
## 507                                       doesn’t         2  320       1   all
## 508                                         1.6.0         2  320       2   all
## 509                                         str_c         2  320       1   all
## 510                                       matches         2  320       1   all
## 511                                         vowel         2  320       1   all
## 512                                    str_detect         2  320       1   all
## 513                                    str_locate         2  320       1   all
## 514                                   str_extract         2  320       1   all
## 515                                     str_match         2  320       1   all
## 516                                   str_replace         2  320       1   all
## 517                                     str_split         2  320       1   all
## 518                                         fixed         2  320       2   all
## 519                                         exact         2  320       2   all
## 520                                    regexplain         2  320       1   all
## 521                                     resources         2  320       2   all
## 522                                install_github         2  320       2   all
## 523                                        result         2  320       1   all
## 524                                    fct_infreq         2  320       1   all
## 525                                      fct_lump         2  320       1   all
## 526                                         is.na         2  320       1   all
## 527                                         count         2  320       1   all
## 528                                      geom_bar         2  320       1   all
## 529                              stringsasfactors         2  320       1   all
## 530                                         zones         2  320       1   all
## 531                                       savings         2  320       1   all
## 532                                           ymd         2  320       1   all
## 533                                       ymd_hms         2  320       1   all
## 534                                           dmy         2  320       1   all
## 535                                           mdy         2  320       1   all
## 536                                         month         2  320       1   all
## 537                                          wday         2  320       1   all
## 538                                         label         2  320       1   all
## 539                                           mon         2  320       1   all
## 540                                           tue         2  320       1   all
## 541                                           thu         2  320       1   all
## 542                                           sat         2  320       1   all
## 543                                       with_tz         2  320       1   all
## 544                                      force_tz         2  320       1   all
## 545                                       america         2  320       1   all
## 546                                       chicago         2  320       1   all
## 547                                           cst         2  320       1   all
## 548                                       vitalie         2  320       1   all
## 549                                         spinu         2  320       1   all
## 550                                       garrett         2  320       1   all
## 551                                     grolemund         2  320       1   all
## 552                                  ggplot24.0.1         1  552       1   all
## 553                                         3.5.0         1  552       1   all
## 554                                         3.4.0         1  552       1   all
## 555                                         3.2.0         1  552       1   all
## 556                                         3.1.0         1  552       1   all
## 557                                     profiling         1  552       1   all
## 558                                          axes         1  552       1   all
## 559                                   customising         1  552       1   all
## 560                                    annotation         1  552       1   all
## 561                                      barplots         1  552       1   all
## 562                                 declaratively         1  552       1   all
## 563                                      creating         1  552       1   all
## 564                                    aesthetics         1  552       1   all
## 565                                     graphical         1  552       1   all
## 566                                    primitives         1  552       1   all
## 567                                         takes         1  552       1   all
## 568                                          care         1  552       1   all
## 569                                    succinctly         1  552       1   all
## 570                                      embodies         1  552       1   all
## 571                                          deep         1  552       1   all
## 572                                    philosophy         1  552       1   all
## 573                                 visualisation         1  552       1   all
## 574                                        supply         1  552       1   all
## 575                                       mapping         1  552       1   all
## 576                                        layers         1  552       1   all
## 577                                geom_histogram         1  552       1   all
## 578                                        scales         1  552       1   all
## 579                           scale_colour_brewer         1  552       1   all
## 580                                    facet_wrap         1  552       1   all
## 581                                    coordinate         1  552       1   all
## 582                                       systems         1  552       1   all
## 583                                         displ         1  552       1   all
## 584                                           hwy         1  552       1   all
## 585                                        colour         1  552       1   all
## 586                                         class         1  552       1   all
## 587                                     lifecycle         1  552       1   all
## 588                                      hundreds         1  552       1   all
## 589                                     thousands         1  552       1   all
## 590                                        people         1  552       1   all
## 591                                      millions         1  552       1   all
## 592                                         plots         1  552       1   all
## 593                                  by-and-large         1  552       1   all
## 594                                     arguments         1  552       1   all
## 595                                    compelling         1  552       1   all
## 596                                       reasons         1  552       1   all
## 597                                    innovation         1  552       1   all
## 598                                          rich         1  552       1   all
## 599                                     ecosystem         1  552       1   all
## 600                                    maintained         1  552       1   all
## 601  https://exts.ggplot2.tidyverse.org/gallery/.         1  552       1   all
## 602                                      learning         1  552       1   all
## 603                                      starting         1  552       1   all
## 604                                    systematic         1  552       1   all
## 605                                 documentation         1  552       1   all
## 606                                 communication         1  552       1   all
## 607                                         speed         1  552       1   all
## 608                                    essentials         1  552       1   all
## 609                                        online         1  552       1   all
## 610                                       webinar         1  552       1   all
## 611                                      plotting         1  552       1   all
## 612                                          dive         1  552       1   all
## 613                                      cookbook         1  552       1   all
## 614                                       recipes         1  552       1   all
## 615                                        basics         1  552       1   all
## 616                                       elegant         1  552       1   all
## 617                                   theoretical         1  552       1   all
## 618                                 underpinnings         1  552       1   all
## 619                                          book         1  552       1   all
## 620                                         helps         1  552       1   all
## 621                                    understand         1  552       1   all
## 622                                        theory         1  552       1   all
## 623                                     underpins         1  552       1   all
## 624                                      tailored         1  552       1   all
## 625                                 announcements         1  552       1   all
## 626                                    deep-dives         1  552       1   all
## 627                                         visit         1  552       1   all
## 628                                          blog         1  552       1   all
## 629                                         posit         1  552       1   all
## 630                                         stack         1  552       1   all
## 631                                      overflow         1  552       1   all
## 632                                       answers         1  552       1   all
## 633                                       created         1  552       1   all
## 634                                             🧩         1  552       1   all
## 635                                    dplyr1.1.4         1  552       1   all
## 636                                    automation         1  552       1   all
## 637                                   column-wise         1  552       1   all
## 638                                      row-wise         1  552       1   all
## 639                                         0.8.3         1  552       1   all
## 640                                         0.8.2         1  552       1   all
## 641                                         0.8.1         1  552       1   all
## 642                                         0.7.5         1  552       1   all
## 643                                    challenges         1  552       1   all
## 644                                          adds         1  552       1   all
## 645                                       reduces         1  552       1   all
## 646                                       perform         1  552       1   all
## 647                                     operation         1  552       1   all
## 648                                  single-table         1  552       1   all
## 649                                       variety         1  552       1   all
## 650                                      addition         1  552       1   all
## 651                                 computational         1  552       1   all
## 652                                    accessible         1  552       1   all
## 653                                     efficient         1  552       1   all
## 654                                   alternative         1  552       1   all
## 655                            larger-than-memory         1  552       1   all
## 656                                        remote         1  552       1   all
## 657                                         cloud         1  552       1   all
## 658                                       storage         1  552       1   all
## 659                                           aws         1  552       1   all
## 660                                            s3         1  552       1   all
## 661                                         acero         1  552       1   all
## 662                                        dtplyr         1  552       1   all
## 663                                        dbplyr         1  552       1   all
## 664                                    relational         1  552       1   all
## 665                                      duckplyr         1  552       1   all
## 666                                       queries         1  552       1   all
## 667                                     automatic         1  552       1   all
## 668                                      fallback         1  552       1   all
## 669                                         isn’t         1  552       1   all
## 670                                      sparklyr         1  552       1   all
## 671                                         spark         1  552       1   all
## 672                                       feature         1  552       1   all
## 673                                         cheat         1  552       1   all
## 674                                         sheet         1  552       1   all
## 675                                           112         1  552       1   all
## 676                                            33         1  552       1   all
## 677                                         r5-d4         1  552       1   all
## 678                                            97         1  552       1   all
## 679                                        r4-p17         1  552       1   all
## 680                                        silver         1  552       1   all
## 681                                      feminine         1  552       1   all
## 682                                     ends_with         1  552       1   all
## 683                                         color         1  552       1   all
## 684                                         blond         1  552       1   all
## 685                                          fair         1  552       1   all
## 686                                           100         1  552       1   all
## 687                                           172         1  552       1   all
## 688                                            77         1  552       1   all
## 689                                          26.0         1  552       1   all
## 690                                          26.9         1  552       1   all
## 691                                          34.7         1  552       1   all
## 692                                          33.3         1  552       1   all
## 693                                           150         1  552       1   all
## 694                                            49         1  552       1   all
## 695                                          21.8         1  552       1   all
## 696                                          desc         1  552       1   all
## 697                                         jabba         1  552       1   all
## 698                                           175         1  552       1   all
## 699                                          1358         1  552       1   all
## 700                                     green-tan         1  552       1   all
## 701                                        orange         1  552       1   all
## 702                                           600         1  552       1   all
## 703                                          herm         1  552       1   all
## 704                                      grievous         1  552       1   all
## 705                                           216         1  552       1   all
## 706                                           159         1  552       1   all
## 707                                         green         1  552       1   all
## 708                                          41.9         1  552       1   all
## 709                                       tarfful         1  552       1   all
## 710                                           234         1  552       1   all
## 711                                         na.rm         1  552       1   all
## 712                                            50         1  552       1   all
## 713                                          69.8         1  552       1   all
## 714                                            74         1  552       1   all
## 715                                          81.3         1  552       1   all
## 716                                            88         1  552       1   all
## 717                                          53.1         1  552       1   all
## 718                                         issue         1  552       1   all
## 719                                    manipulatr         1  552       1   all
## 720                                       mailing         1  552       1   all
## 721                                 participating         1  552       1   all
## 722                                    tidyr1.3.1         1  552       1   all
## 723                                         0.7.0         1  552       1   all
## 724                                         0.6.0         1  552       1   all
## 725                                       storing         1  552       1   all
## 726                                        ensure         1  552       1   all
## 727                                        you’ll         1  552       1   all
## 728                                         spend         1  552       1   all
## 729                                      fighting         1  552       1   all
## 730                                     tidy-data         1  552       1   all
## 731                                          fall         1  552       1   all
## 732                                    categories         1  552       1   all
## 733                                          wide         1  552       1   all
## 734                                         forms         1  552       1   all
## 735                                  pivot_longer         1  552       1   all
## 736                                   pivot_wider         1  552       1   all
## 737                                     replacing         1  552       1   all
## 738                                        deeply         1  552       1   all
## 739                                          json         1  552       1   all
## 740                                 unnest_longer         1  552       1   all
## 741                                  unnest_wider         1  552       1   all
## 742                                         hoist         1  552       1   all
## 743                                     rectangle         1  552       1   all
## 744                                       nesting         1  552       1   all
## 745                                          form         1  552       1   all
## 746                                     unnesting         1  552       1   all
## 747                                        unnest         1  552       1   all
## 748                                     splitting         1  552       1   all
## 749                                     combining         1  552       1   all
## 750                          separate_wider_delim         1  552       1   all
## 751                       separate_wider_position         1  552       1   all
## 752                          separate_wider_regex         1  552       1   all
## 753                                          pull         1  552       1   all
## 754                                         unite         1  552       1   all
## 755                                       drop_na         1  552       1   all
## 756                                      previous         1  552       1   all
## 757                                    replace_na         1  552       1   all
## 758                                    supersedes         1  552       1   all
## 759                                     2010-2014         1  552       1   all
## 760                                     2005-2010         1  552       1   all
## 761                            counterintuitively         1  552       1   all
## 762                                       tidying         1  552       1   all
## 763                                   aggregation         1  552       1   all
## 764                              high-performance         1  552       1   all
## 765                                         dcast         1  552       1   all
## 766                                   perspective         1  552       1   all
## 767                                           i’d         1  552       1   all
## 768                                        papers         1  552       1   all
## 769                                      wrangler         1  552       1   all
## 770                                        visual         1  552       1   all
## 771                                       scripts         1  552       1   all
## 772                                     framework         1  552       1   all
## 773                                      potter’s         1  552       1   all
## 774                                         wheel         1  552       1   all
## 775                                   efficiently         1  552       1   all
## 776                                  implementing         1  552       1   all
## 777                                     schemasql         1  552       1   all
## 778                                        here’s         1  552       1   all
## 779                                   terminology         1  552       1   all
## 780                                         wider         1  552       1   all
## 781                                          cast         1  552       1   all
## 782                                  spreadsheets         1  552       1   all
## 783                                       unpivot         1  552       1   all
## 784                                     databases         1  552       1   all
## 785                                          fold         1  552       1   all
## 786                                        unfold         1  552       1   all
## 787                                    readr2.1.6         1  552       1   all
## 788                                       locales         1  552       1   all
## 789                                          wild         1  552       1   all
## 790                                   informative         1  552       1   all
## 791                                         leads         1  552       1   all
## 792                                    unexpected         1  552       1   all
## 793                                        import         1  552       1   all
## 794                                     attaching         1  552       1   all
## 795                                         1.1.4         1  552       1   all
## 796                                    2.1.5.9000         1  552       1   all
## 797                                         1.0.1         1  552       1   all
## 798                                         1.5.2         1  552       1   all
## 799                                         1.9.4         1  552       1   all
## 800                           tidyverse_conflicts         1  552       1   all
## 801                                    conflicted         1  552       1   all
## 802                  http://conflicted.r-lib.org/         1  552       1   all
## 803                                         force         1  552       1   all
## 804                                        errors         1  552       1   all
## 805                                        parses         1  552       1   all
## 806                                         lines         1  552       1   all
## 807                                        fields         1  552       1   all
## 808                                      supports         1  552       1   all
## 809                                         read_         1  552       1   all
## 810                                      read_tsv         1  552       1   all
## 811                                     read_csv2         1  552       1   all
## 812                           semicolon-separated         1  552       1   all
## 813                                       decimal         1  552       1   all
## 814                                          mark         1  552       1   all
## 815                                    read_delim         1  552       1   all
## 816                                       special         1  552       1   all
## 817                                      read_fwf         1  552       1   all
## 818                                   fixed-width         1  552       1   all
## 819                                    read_table         1  552       1   all
## 820                          whitespace-separated         1  552       1   all
## 821                                      read_log         1  552       1   all
## 822                                           log         1  552       1   all
## 823                                     converted         1  552       1   all
## 824                                      specific         1  552       1   all
## 825                                      datetime         1  552       1   all
## 826                                       absence         1  552       1   all
## 827                                  column-types         1  552       1   all
## 828                                      guessing         1  552       1   all
## 829                                         handy         1  552       1   all
## 830                                   exploration         1  552       1   all
## 831                                      remember         1  552       1   all
## 832                                   exploratory         1  552       1   all
## 833                                         phase         1  552       1   all
## 834                                      strategy         1  552       1   all
## 835                                         loads         1  552       1   all
## 836                                        sample         1  552       1   all
## 837                                       bundled         1  552       1   all
## 838                                show_col_types         1  552       1   all
## 839                                         quiet         1  552       1   all
## 840                                       message         1  552       1   all
## 841                                        prints         1  552       1   all
## 842                                        expect         1  552       1   all
## 843                                       haven’t         1  552       1   all
## 844                                        sounds         1  552       1   all
## 845                                           lot         1  552       1   all
## 846                                       trouble         1  552       1   all
## 847                                       luckily         1  552       1   all
## 848                                       affords         1  552       1   all
## 849                                          nice         1  552       1   all
## 850                                      workflow         1  552       1   all
## 851                                       initial         1  552       1   all
## 852                                        effort         1  552       1   all
## 853                                    col_double         1  552       1   all
## 854                                         paste         1  552       1   all
## 855                                         tweak         1  552       1   all
## 856                                     expresses         1  552       1   all
## 857                                       desired         1  552       1   all
## 858                                       express         1  552       1   all
## 859                                       integer         1  552       1   all
## 860                                    col_factor         1  552       1   all
## 861                                   col_integer         1  552       1   all
## 862                                      expanded         1  552       1   all
## 863                                      editions         1  552       1   all
## 864                                          july         1  552       1   all
## 865                                          2021         1  552       1   all
## 866                                     so-called         1  552       1   all
## 867                                         calls         1  552       1   all
## 868                                      versions         1  552       1   all
## 869                                         prior         1  552       1   all
## 870                                        access         1  552       1   all
## 871                                    definition         1  552       1   all
## 872                                        that’s         1  552       1   all
## 873                                      continue         1  552       1   all
## 874                                       support         1  552       1   all
## 875                                     uniformly         1  552       1   all
## 876                                          plan         1  552       1   all
## 877                                    eventually         1  552       1   all
## 878                                     deprecate         1  552       1   all
## 879                                        remove         1  552       1   all
## 880                           actively-maintained         1  552       1   all
## 881                                   workarounds         1  552       1   all
## 882                                       offered         1  552       1   all
## 883                                     pragmatic         1  552       1   all
## 884                                         patch         1  552       1   all
## 885                                        legacy         1  552       1   all
## 886                                     temporary         1  552       1   all
## 887                                      solution         1  552       1   all
## 888                                  infelicities         1  552       1   all
## 889                                    identified         1  552       1   all
## 890                                  data.table’s         1  552       1   all
## 891                                   differences         1  552       1   all
## 892                                     discussed         1  552       1   all
## 893                                        naming         1  552       1   all
## 894                                        scheme         1  552       1   all
## 895                                     col_names         1  552       1   all
## 896                                    colclasses         1  552       1   all
## 897                                        faster         1  552       1   all
## 898                                      10x-100x         1  552       1   all
## 899                                         leave         1  552       1   all
## 900                                           bar         1  552       1   all
## 901                                       loading         1  552       1   all
## 902                                       current         1  552       1   all
## 903                                      override         1  552       1   all
## 904                                    us-centric         1  552       1   all
## 905                                      defaults         1  552       1   all
## 906                                        slower         1  552       1   all
## 907                                         heavy         1  552       1   all
## 908                                     basically         1  552       1   all
## 909                                     encourage         1  552       1   all
## 910                                       skipped         1  552       1   all
## 911                                tidyverse-wide         1  552       1   all
## 912                                     returning         1  552       1   all
## 913                                      approach         1  552       1   all
## 914                                        repair         1  552       1   all
## 915                                 mini-language         1  552       1   all
## 916                                     selection         1  552       1   all
## 917                              acknowledgements         1  552       1   all
## 918                                           joe         1  552       1   all
## 919                                         cheng         1  552       1   all
## 920                                        beauty         1  552       1   all
## 921                                 deterministic         1  552       1   all
## 922                                        finite         1  552       1   all
## 923                                      automata         1  552       1   all
## 924                                      teaching         1  552       1   all
## 925                                         write         1  552       1   all
## 926                                     tokenizer         1  552       1   all
## 927                                            jj         1  552       1   all
## 928                                       allaire         1  552       1   all
## 929                                       helping         1  552       1   all
## 930                                        design         1  552       1   all
## 931                                        extend         1  552       1   all
## 932                                          dirk         1  552       1   all
## 933                                  eddelbuettel         1  552       1   all
## 934                                        coming         1  552       1   all
## 935                                    purrr1.2.0         1  552       1   all
## 936                                         0.2.3         1  552       1   all
## 937                                      enhances         1  552       1   all
## 938                                           r’s         1  552       1   all
## 939                                       toolkit         1  552       1   all
## 940                                         heard         1  552       1   all
## 941                                        family         1  552       1   all
## 942                                         loops         1  552       1   all
## 943                                      succinct         1  552       1   all
## 944                                     realistic         1  552       1   all
## 945                                         model         1  552       1   all
## 946                                         piece         1  552       1   all
## 947                                       compute         1  552       1   all
## 948                                            r2         1  552       1   all
## 949                                             $         1  552       1   all
## 950                                           cyl         1  552       1   all
## 951                                            lm         1  552       1   all
## 952                                            wt         1  552       1   all
## 953                                     r.squared         1  552       1   all
## 954                                     0.5086326         1  552       1   all
## 955                                     0.4645102         1  552       1   all
## 956                                     0.4229655         1  552       1   all
## 957                                   equivalents         1  552       1   all
## 958                                   type-stable         1  552       1   all
## 959                                        return         1  552       1   all
## 960                                    advertised         1  552       1   all
## 961                                        double         1  552       1   all
## 962                                         throw         1  552       1   all
## 963                                         error         1  552       1   all
## 964                                        accept         1  552       1   all
## 965                                         named         1  552       1   all
## 966                                     anonymous         1  552       1   all
## 967                                        lambda         1  552       1   all
## 968                                       obvious         1  552       1   all
## 969                                       running         1  552       1   all
## 970                                          jobs         1  552       1   all
## 971                                   in_parallel         1  552       1   all
## 972                                   computation         1  552       1   all
## 973                                         cores         1  552       1   all
## 974                                      machines         1  552       1   all
## 975                                       network         1  552       1   all
## 976                                   tibble3.3.0         1  552       1   all
## 977                                   controlling         1  552       1   all
## 978                                    invariants         1  552       1   all
## 979                                      behavior         1  552       1   all
## 980                                         2.1.1         1  552       1   all
## 981                                         2.0.1         1  552       1   all
## 982                              pre-announcement         1  552       1   all
## 983                                         1.4.2         1  552       1   all
## 984                                         1.4.1         1  552       1   all
## 985                                        tbl_df         1  552       1   all
## 986                                        modern         1  552       1   all
## 987                                   reimagining         1  552       1   all
## 988                                       keeping         1  552       1   all
## 989                                        proven         1  552       1   all
## 990                                     effective         1  552       1   all
## 991                                      throwing         1  552       1   all
## 992                                          lazy         1  552       1   all
## 993                                         surly         1  552       1   all
## 994                                           i.e         1  552       1   all
## 995                                       partial         1  552       1   all
## 996                                      complain         1  552       1   all
## 997                                         exist         1  552       1   all
## 998                                        forces         1  552       1   all
## 999                                      confront         1  552       1   all
## 1000                                      earlier         1  552       1   all
## 1001                                    typically         1  552       1   all
## 1002                                      leading         1  552       1   all
## 1003                                      cleaner         1  552       1   all
## 1004                                   expressive         1  552       1   all
## 1005                                     enhanced         1  552       1   all
## 1006                                      complex         1  552       1   all
## 1007                                     sys.date         1  552       1   all
## 1008                                   reasonable         1  552       1   all
## 1009                                     matrices         1  552       1   all
## 1010                                       tables         1  552       1   all
## 1011                                           26         1  552       1   all
## 1012                                     recycles         1  552       1   all
## 1013                                      creates         1  552       1   all
## 1014                                    row.names         1  552       1   all
## 1015                                       define         1  552       1   all
## 1016                                   row-by-row         1  552       1   all
## 1017                                        draws         1  552       1   all
## 1018                                  inspiration         1  552       1   all
## 1019                                     rownames         1  552       1   all
## 1020                                   2.1.3.9000         1  552       1   all
## 1021                                 stringr1.6.0         1  552       1   all
## 1022                                    sensitive         1  552       1   all
## 1023                                        1.5.0         1  552       1   all
## 1024                                    glamorous         1  552       1   all
## 1025                                 high-profile         1  552       1   all
## 1026                                         play         1  552       1   all
## 1027                                         role         1  552       1   all
## 1028                                  preparation         1  552       1   all
## 1029                                        tasks         1  552       1   all
## 1030                                     cohesive         1  552       1   all
## 1031                                     familiar         1  552       1   all
## 1032                                          icu         1  552       1   all
## 1033                                      correct         1  552       1   all
## 1034                                manipulations         1  552       1   all
## 1035                                     focusses         1  552       1   all
## 1036                                     commonly         1  552       1   all
## 1037                                     covering         1  552       1   all
## 1038                                      imagine         1  552       1   all
## 1039                                        share         1  552       1   all
## 1040                                         str_         1  552       1   all
## 1041                                   str_length         1  552       1   all
## 1042                                     collapse         1  552       1   all
## 1043                                      str_sub         1  552       1   all
## 1044                                      concise         1  552       1   all
## 1045                                     language         1  552       1   all
## 1046                                   describing         1  552       1   all
## 1047                                   expression         1  552       1   all
## 1048                                        tells         1  552       1   all
## 1049                                      there’s         1  552       1   all
## 1050                                       counts         1  552       1   all
## 1051                                      defined         1  552       1   all
## 1052                                  parentheses         1  552       1   all
## 1053                                   characters         1  552       1   all
## 1054                                          vid         1  552       1   all
## 1055                                          ros         1  552       1   all
## 1056                                          dea         1  552       1   all
## 1057                                          aut         1  552       1   all
## 1058                                  replacement         1  552       1   all
## 1059                                     replaces         1  552       1   all
## 1060                                          deo         1  552       1   all
## 1061                                           ss         1  552       1   all
## 1062                                         xtra         1  552       1   all
## 1063                                     uthority         1  552       1   all
## 1064                                       splits         1  552       1   all
## 1065                                      engines         1  552       1   all
## 1066                                        bytes         1  552       1   all
## 1067                                         coll         1  552       1   all
## 1068                                     boundary         1  552       1   all
## 1069                                   boundaries         1  552       1   all
## 1070                                    interface         1  552       1   all
## 1071                                interactively         1  552       1   all
## 1072                                        build         1  552       1   all
## 1073                                       regexp         1  552       1   all
## 1074                                      consult         1  552       1   all
## 1075                                     included         1  552       1   all
## 1076                                    installed         1  552       1   all
## 1077                                    gadenbuie         1  552       1   all
## 1078                                        solid         1  552       1   all
## 1079                                        grown         1  552       1   all
## 1080                                  organically         1  552       1   all
## 1081                                 inconsistent         1  552       1   all
## 1082                                 additionally         1  552       1   all
## 1083                                         ruby         1  552       1   all
## 1084                                       python         1  552       1   all
## 1085                                       modify         1  552       1   all
## 1086                                  conjunction         1  552       1   all
## 1087                                      str_pad         1  552       1   all
## 1088                                           11         1  552       1   all
## 1089                                   simplifies         1  552       1   all
## 1090                                  eliminating         1  552       1   all
## 1091                                      options         1  552       1   all
## 1092                                           95         1  552       1   all
## 1093                                     produces         1  552       1   all
## 1094                                     includes         1  552       1   all
## 1095                                     ensuring         1  552       1   all
## 1096                                    from-base         1  552       1   all
## 1097                                         r4ds         1  552       1   all
## 1098                                 forcats1.0.1         1  552       1   all
## 1099                                       handle         1  552       1   all
## 1100                                      improve         1  552       1   all
## 1101                                        suite         1  552       1   all
## 1102                                     examples         1  552       1   all
## 1103                                      include         1  552       1   all
## 1104                                  fct_reorder         1  552       1   all
## 1105                                    frequency         1  552       1   all
## 1106                                  fct_relevel         1  552       1   all
## 1107                                         hand         1  552       1   all
## 1108                                   collapsing         1  552       1   all
## 1109                                     frequent         1  552       1   all
## 1110                                         sort         1  552       1   all
## 1111                                           37         1  552       1   all
## 1112                                      twi'lek         1  552       1   all
## 1113                                      wookiee         1  552       1   all
## 1114                                       zabrak         1  552       1   all
## 1115                                       aleena         1  552       1   all
## 1116                                     besalisk         1  552       1   all
## 1117                                           27         1  552       1   all
## 1118                                      history         1  552       1   all
## 1119                                 unauthorized         1  552       1   all
## 1120                                    biography         1  552       1   all
## 1121                                        roger         1  552       1   all
## 1122                                         peng         1  552       1   all
## 1123                                         sigh         1  552       1   all
## 1124                                       lumley         1  552       1   all
## 1125                                   approaches         1  552       1   all
## 1126                                    wrangling         1  552       1   all
## 1127                                       amelia         1  552       1   all
## 1128                                     mcnamara         1  552       1   all
## 1129                                     nicholas         1  552       1   all
## 1130                                       horton         1  552       1   all
## 1131                     https://forum.posit.co/.         1  552       1   all
## 1132                               lubridate1.9.4         1  552       1   all
## 1133                                        1.7.0         1  552       1   all
## 1134                                  frustrating         1  552       1   all
## 1135                                     commands         1  552       1   all
## 1136                                  unintuitive         1  552       1   all
## 1137                                      methods         1  552       1   all
## 1138                                       robust         1  552       1   all
## 1139                                         days         1  552       1   all
## 1140                                     daylight         1  552       1   all
## 1141                                       quirks         1  552       1   all
## 1142                                        lacks         1  552       1   all
## 1143                                 capabilities         1  552       1   all
## 1144                                   situations         1  552       1   all
## 1145                               warn.conflicts         1  552       1   all
## 1146                                      dmy_hms         1  552       1   all
## 1147                                     20101215         1  552       1   all
## 1148                                   2010-12-15         1  552       1   all
## 1149                                   2017-04-01         1  552       1   all
## 1150                                       simple         1  552       1   all
## 1151                                         mday         1  552       1   all
## 1152                                         hour         1  552       1   all
## 1153                                       minute         1  552       1   all
## 1154                                         1979         1  552       1   all
## 1155                                         2016         1  552       1   all
## 1156                                       helper         1  552       1   all
## 1157                                     handling         1  552       1   all
## 1158                                          utc         1  552       1   all
## 1159                                     printing         1  552       1   all
## 1160                                           09         1  552       1   all
## 1161                                      expands         1  552       1   all
## 1162                                 mathematical         1  552       1   all
## 1163                                    performed         1  552       1   all
## 1164                                         span         1  552       1   all
## 1165                                      classes         1  552       1   all
## 1166                                     borrowed         1  552       1   all
## 1167                         https://www.joda.org         1  552       1   all
## 1168                                    durations         1  552       1   all
## 1169                                      measure         1  552       1   all
## 1170                                      periods         1  552       1   all
## 1171                                   accurately         1  552       1   all
## 1172                                        clock         1  552       1   all
## 1173                                          day         1  552       1   all
## 1174                                    intervals         1  552       1   all
## 1175                                      protean         1  552       1   all
## 1176                                          gpl         1  552       1   all

6.8.8.1.3 Plot

We plot the top 15 most frequent words used in the text.

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm %>% 
  textstat_frequency(n = 15) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

6.8.8.2 Lexical diversity

We can compute the lexical diversity in a document. This is a measure allowing us to provide a statistical account of diversity in the choice of lexical items in a text. See the different measures implemented here

6.8.8.2.1 TTR (Type-Token Ratio)

6.8.8.2.1.1 Computing TTR

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_ttr <- textstat_lexdiv(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm, measure = "TTR")
head(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_ttr, 5)

##   document       TTR
## 1    text1 0.6180758
## 2    text2 0.4936709
## 3    text3 0.6146497
## 4    text4 0.5466238
## 5    text5 0.6635514

6.8.8.2.1.2 Plotting TTR

plot(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_ttr$TTR, type = "l", xaxt = "n", xlab = NULL, ylab = "TTR")
grid()
axis(1, at = seq_len(nrow(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_ttr)), labels = web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_ttr$document)

6.8.8.2.2 CTTR (Corrected Type-Token Ratio)

6.8.8.2.2.1 Computing CTTR

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_cttr <- textstat_lexdiv(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm, measure = "CTTR")
head(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_cttr, 5)

##   document     CTTR
## 1    text1 8.094198
## 2    text2 7.599967
## 3    text3 7.701538
## 4    text4 9.639816
## 5    text5 6.863829

6.8.8.2.2.2 Plotting TTR

plot(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_cttr$CTTR, type = "l", xaxt = "n", xlab = NULL, ylab = "CTTR")
grid()
axis(1, at = seq_len(nrow(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_cttr)), labels = web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_cttr$document)

6.8.8.2.3 K (Yule’s K)

6.8.8.2.3.1 Computing K

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_K <- textstat_lexdiv(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm, measure = "K")
head(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_K, 5)

##   document        K
## 1    text1 99.10836
## 2    text2 76.55468
## 3    text3 83.57337
## 4    text4 60.17308
## 5    text5 90.83763

6.8.8.2.3.2 Plotting K

plot(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_K$K, type = "l", xaxt = "n", xlab = NULL, ylab = expression(italic(K)))
grid()
axis(1, at = seq_len(nrow(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_K)), labels = web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_lexdiv_K$document)

6.8.8.3 Keyness - relative frequency analysis

The relative frequency analysis allows to provide a statistical analysis of frequent words as a function of a target reference level. For this dataset, we do not have a specific target. Hence the comparison is done based on the full dataset.

6.8.8.3.1 Computing keyness

web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_key <- textstat_keyness(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm)
head(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_key, 10)

##     feature      chi2            p n_target n_reference
## 1   ggplot2 234.98239 0.000000e+00       26           2
## 2       faq  61.00858 5.662137e-15        7           0
## 3  graphics  61.00858 5.662137e-15        7           0
## 4     chang  20.53116 5.866844e-06        3           0
## 5      kara  20.53116 5.866844e-06        3           0
## 6       lin  20.53116 5.866844e-06        3           0
## 7  pedersen  20.53116 5.866844e-06        3           0
## 8   winston  20.53116 5.866844e-06        3           0
## 9       woo  20.53116 5.866844e-06        3           0
## 10   author  19.34380 1.091731e-05       10          20

6.8.8.3.2 Plotting

textplot_keyness(web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_tstat_key, margin = 0.2)

6.8.8.4 Collocations - scoring multi-word expressions

A collocation analysis is a way to identify contiguous collocations of words, i.e., multi-word expressions. Depending on the language, these can be identified based on capitalisation (e.g., proper names) as in English texts. However, this is not the same across languages.

We look for capital letters in our text. The result provides Wald’s Lamda and z statistics. Usually, any z value higher or equal to 2 is statistically significant. To compute p values, we use the probability of a normal distribution based on a mean of 0 and an SD of 1. This is appended to the table.

web_pages_txt_corpus_tok_no_punct_no_Stop_tstat_col_caps <- tokens_select(web_pages_txt_corpus_tok_no_punct_no_Stop, pattern = c("^[A-Z]", "^[a-z]"), valuetype = "regex", case_insensitive = FALSE, padding = TRUE) %>%  textstat_collocations(min_count = 10) %>% mutate(p_value = 1 - pnorm(z, 0, 1))
web_pages_txt_corpus_tok_no_punct_no_Stop_tstat_col_caps

##      collocation count count_nested length    lambda         z      p_value
## 1        pak pak    12            0      2  7.082777 11.371583 0.000000e+00
## 2 hadley wickham    18            0      2 12.606207  6.260750 1.915649e-10
## 3   code conduct    13            0      2  8.624086  5.915232 1.657036e-09

6.8.8.5 Word clouds

We can use word clouds of the top 100 words

set.seed(132)
web_pages_txt_corpus_tok_no_punct_no_Stop_dfm %>% 
  textplot_wordcloud(max_words = 100, color = brewer.pal(8, "Dark2"))

6.8.8.6 Network of feature co-occurrences

A Network of feature co-occurrences allows to obtain association plot of word usage. We use an fcm (feature co-occurrence matrix) based on our DFM.

set.seed(144)
web_pages_txt_corpus_tok_no_punct_no_Stop_dfm %>% 
  dfm_trim(min_termfreq = 20) %>%
  textplot_network(min_freq = 0.5)

6.8.8.7 Poisson regression

Finally, we run a GLM with a poisson family to evaluate the significance level of our most frequent words.

6.8.8.7.1 Computing GLM

web_pages_txt_corpus_GLM <- web_pages_txt_corpus_tok_no_punct_no_Stop_dfm_freq %>% 
  filter(frequency >= 20) %>% 
  glm(frequency ~ feature, data = ., family = "poisson")
summary(web_pages_txt_corpus_GLM)

## 
## Call:
## glm(formula = frequency ~ feature, family = "poisson", data = .)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               4.5109     0.1048  43.031  < 2e-16 ***
## feature=                 -0.8733     0.1931  -4.521 6.14e-06 ***
## feature>                  1.0913     0.1212   9.007  < 2e-16 ***
## feature─                  0.3719     0.1363   2.730 0.006337 ** 
## feature1                 -0.2342     0.1577  -1.485 0.137597    
## feature2                 -0.6607     0.1796  -3.678 0.000235 ***
## feature3                 -0.8473     0.1914  -4.427 9.55e-06 ***
## feature4                 -1.2920     0.2258  -5.722 1.06e-08 ***
## feature5                 -1.3754     0.2334  -5.893 3.79e-09 ***
## featureauthor            -1.1097     0.2105  -5.271 1.36e-07 ***
## featurechr               -1.0769     0.2080  -5.178 2.24e-07 ***
## featurecode              -1.0451     0.2055  -5.085 3.67e-07 ***
## featurecolumn            -1.1787     0.2161  -5.454 4.93e-08 ***
## featuredata              -0.5035     0.1708  -2.948 0.003197 ** 
## featuredplyr             -1.4198     0.2376  -5.976 2.28e-09 ***
## featurefunctions         -1.1436     0.2132  -5.363 8.20e-08 ***
## featureggplot2           -1.1787     0.2161  -5.454 4.93e-08 ***
## featureinstall.packages  -1.2528     0.2224  -5.634 1.77e-08 ***
## featurelearn             -1.5151     0.2470  -6.135 8.51e-10 ***
## featurelicense           -1.2528     0.2224  -5.634 1.77e-08 ***
## featurereadr             -1.1436     0.2132  -5.363 8.20e-08 ***
## featuretibble            -1.0451     0.2055  -5.085 3.67e-07 ***
## featuretidyverse         -0.9845     0.2010  -4.898 9.68e-07 ***
## featureversion           -0.2068     0.1565  -1.321 0.186470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 8.0439e+02  on 23  degrees of freedom
## Residual deviance: 3.5527e-15  on  0  degrees of freedom
## AIC: 180.61
## 
## Number of Fisher Scoring iterations: 3

6.8.8.7.2 Visualising coefficients

6.8.8.7.2.1 A plot

We use two functions from the package ggstats. Because we used a poisson distribution, we obtain the results in IRR (=Incident rate ratios). Usually, we need to exponentiate these to make sense of the results.

ggcoef_model(web_pages_txt_corpus_GLM, exponentiate = TRUE)

6.8.8.7.2.2 A plot + a table + 95% CI

ggcoef_table(web_pages_txt_corpus_GLM, exponentiate = TRUE)