During this session, we will use whisper to
automatically transcribe an audio file. We will use the R package
reticulate to run Python code from within RStudio. At the
end of the session, we will export the transcription to a
TextGrid file that can be opened in Praat for
further acoustic analysis.
First, make sure to have already installed and loaded the package
reticulate.
### Use the code below to check whether you have all required packages installed. Any packages that are not installed yet will be installed by this code. If you already have all packages installed, you can simply load them with the second piece of code.
# Packages this session depends on.
requiredPackages <- c("reticulate", "tidyverse")

# Install every package that is missing, then attach all of them.
for (p in requiredPackages) {
  # requireNamespace() only checks availability; the library() call below
  # does the attaching and stops with an error if installation failed.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p, dependencies = TRUE)
  }
  library(p, character.only = TRUE)
}
options(ggrepel.max.overlaps = Inf)
whisper
Make sure to uncomment the lines of code below if you haven’t installed
openai-whisper. To do so, we need to install it using the
function py_install from reticulate. This
only needs to be done once; otherwise, the package will be installed again
each time (and this may take some time). We could use the new R package
audio.whisper, which is built within R. However, the first
time you use this package, you will notice that transcribing the audio
file below takes more than 20 minutes. All subsequent
runs of the code will be relatively faster. In our case, we use the
original implementation of whisper in Python that we run
via RStudio and the reticulate package, which is much faster.
whisper
We obtain various audio files from the audio.whisper package’s GitHub
website, here.
We will use the audio file jfk.wav, which is a recording of
John F. Kennedy’s inaugural address in English.
We download the whisper model. In our case, we use the
base model as it provides better predictions than the
tiny model.
whisper_model <- "base" # Size of the transcription model

# (Down)load the model
print("Loading the model")
#> [1] "Loading the model"

# NOTE(review): the statements that create `model` and `audio`, and the one
# that printed the line below, are not visible in this excerpt; confirm them
# against the original notebook before running this chunk.
#> [1] "Transcription started"

# Transcribe the audio as English and request word-level timestamps, which
# provide the per-word start/end times shown in the output that follows.
transcription <- model$transcribe(
  audio,
  language = "en",
  verbose = FALSE,
  word_timestamps = TRUE
)
print("Transcription completed")
#> [1] "Transcription completed"
$text
[1] " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country."
$segments
$segments[[1]]
$segments[[1]]$id
[1] 0
$segments[[1]]$seek
[1] 0
$segments[[1]]$start
[1] 0
$segments[[1]]$end
[1] 7.42
$segments[[1]]$text
[1] " And so my fellow Americans ask not what your country can do for you,"
$segments[[1]]$tokens
[1] 50364 400 370 452 7177 6280 1029 406 437
[10] 428 1941 393 360 337 291 11 50744
$segments[[1]]$temperature
[1] 0
$segments[[1]]$avg_logprob
[1] -0.3966229
$segments[[1]]$compression_ratio
[1] 1.341772
$segments[[1]]$no_speech_prob
[1] 0.09206953
$segments[[1]]$words
$segments[[1]]$words[[1]]
$segments[[1]]$words[[1]]$word
[1] " And"
$segments[[1]]$words[[1]]$start
[1] 0
$segments[[1]]$words[[1]]$end
[1] 0.52
$segments[[1]]$words[[1]]$probability
[1] 0.6432374
$segments[[1]]$words[[2]]
$segments[[1]]$words[[2]]$word
[1] " so"
$segments[[1]]$words[[2]]$start
[1] 0.52
$segments[[1]]$words[[2]]$end
[1] 0.84
$segments[[1]]$words[[2]]$probability
[1] 0.9871083
$segments[[1]]$words[[3]]
$segments[[1]]$words[[3]]$word
[1] " my"
$segments[[1]]$words[[3]]$start
[1] 0.84
$segments[[1]]$words[[3]]$end
[1] 1.18
$segments[[1]]$words[[3]]$probability
[1] 0.7963706
$segments[[1]]$words[[4]]
$segments[[1]]$words[[4]]$word
[1] " fellow"
$segments[[1]]$words[[4]]$start
[1] 1.18
$segments[[1]]$words[[4]]$end
[1] 1.56
$segments[[1]]$words[[4]]$probability
[1] 0.9954759
$segments[[1]]$words[[5]]
$segments[[1]]$words[[5]]$word
[1] " Americans"
$segments[[1]]$words[[5]]$start
[1] 1.56
$segments[[1]]$words[[5]]$end
[1] 2.1
$segments[[1]]$words[[5]]$probability
[1] 0.9258457
$segments[[1]]$words[[6]]
$segments[[1]]$words[[6]]$word
[1] " ask"
$segments[[1]]$words[[6]]$start
[1] 2.1
$segments[[1]]$words[[6]]$end
[1] 3.72
$segments[[1]]$words[[6]]$probability
[1] 0.2835915
$segments[[1]]$words[[7]]
$segments[[1]]$words[[7]]$word
[1] " not"
$segments[[1]]$words[[7]]$start
[1] 3.72
$segments[[1]]$words[[7]]$end
[1] 4.24
$segments[[1]]$words[[7]]$probability
[1] 0.6561064
$segments[[1]]$words[[8]]
$segments[[1]]$words[[8]]$word
[1] " what"
$segments[[1]]$words[[8]]$start
[1] 4.24
$segments[[1]]$words[[8]]$end
[1] 5.52
$segments[[1]]$words[[8]]$probability
[1] 0.7547233
$segments[[1]]$words[[9]]
$segments[[1]]$words[[9]]$word
[1] " your"
$segments[[1]]$words[[9]]$start
[1] 5.52
$segments[[1]]$words[[9]]$end
[1] 5.76
$segments[[1]]$words[[9]]$probability
[1] 0.9823318
$segments[[1]]$words[[10]]
$segments[[1]]$words[[10]]$word
[1] " country"
$segments[[1]]$words[[10]]$start
[1] 5.76
$segments[[1]]$words[[10]]$end
[1] 6.24
$segments[[1]]$words[[10]]$probability
[1] 0.9989085
$segments[[1]]$words[[11]]
$segments[[1]]$words[[11]]$word
[1] " can"
$segments[[1]]$words[[11]]$start
[1] 6.24
$segments[[1]]$words[[11]]$end
[1] 6.62
$segments[[1]]$words[[11]]$probability
[1] 0.9966052
$segments[[1]]$words[[12]]
$segments[[1]]$words[[12]]$word
[1] " do"
$segments[[1]]$words[[12]]$start
[1] 6.62
$segments[[1]]$words[[12]]$end
[1] 6.82
$segments[[1]]$words[[12]]$probability
[1] 0.9948519
$segments[[1]]$words[[13]]
$segments[[1]]$words[[13]]$word
[1] " for"
$segments[[1]]$words[[13]]$start
[1] 6.82
$segments[[1]]$words[[13]]$end
[1] 7.08
$segments[[1]]$words[[13]]$probability
[1] 0.9948372
$segments[[1]]$words[[14]]
$segments[[1]]$words[[14]]$word
[1] " you,"
$segments[[1]]$words[[14]]$start
[1] 7.08
$segments[[1]]$words[[14]]$end
[1] 7.42
$segments[[1]]$words[[14]]$probability
[1] 0.9964211
$segments[[2]]
$segments[[2]]$id
[1] 1
$segments[[2]]$seek
[1] 0
$segments[[2]]$start
[1] 7.42
$segments[[2]]$end
[1] 10.36
$segments[[2]]$text
[1] " ask what you can do for your country."
$segments[[2]]$tokens
[1] 50744 1029 437 291 393 360 337 428 1941
[10] 13 50894
$segments[[2]]$temperature
[1] 0
$segments[[2]]$avg_logprob
[1] -0.3966229
$segments[[2]]$compression_ratio
[1] 1.341772
$segments[[2]]$no_speech_prob
[1] 0.09206953
$segments[[2]]$words
$segments[[2]]$words[[1]]
$segments[[2]]$words[[1]]$word
[1] " ask"
$segments[[2]]$words[[1]]$start
[1] 7.42
$segments[[2]]$words[[1]]$end
[1] 8.46
$segments[[2]]$words[[1]]$probability
[1] 0.9024832
$segments[[2]]$words[[2]]
$segments[[2]]$words[[2]]$word
[1] " what"
$segments[[2]]$words[[2]]$start
[1] 8.46
$segments[[2]]$words[[2]]$end
[1] 8.76
$segments[[2]]$words[[2]]$probability
[1] 0.9623243
$segments[[2]]$words[[3]]
$segments[[2]]$words[[3]]$word
[1] " you"
$segments[[2]]$words[[3]]$start
[1] 8.76
$segments[[2]]$words[[3]]$end
[1] 9.04
$segments[[2]]$words[[3]]$probability
[1] 0.9864706
$segments[[2]]$words[[4]]
$segments[[2]]$words[[4]]$word
[1] " can"
$segments[[2]]$words[[4]]$start
[1] 9.04
$segments[[2]]$words[[4]]$end
[1] 9.32
$segments[[2]]$words[[4]]$probability
[1] 0.9966297
$segments[[2]]$words[[5]]
$segments[[2]]$words[[5]]$word
[1] " do"
$segments[[2]]$words[[5]]$start
[1] 9.32
$segments[[2]]$words[[5]]$end
[1] 9.56
$segments[[2]]$words[[5]]$probability
[1] 0.9923227
$segments[[2]]$words[[6]]
$segments[[2]]$words[[6]]$word
[1] " for"
$segments[[2]]$words[[6]]$start
[1] 9.56
$segments[[2]]$words[[6]]$end
[1] 9.78
$segments[[2]]$words[[6]]$probability
[1] 0.9964545
$segments[[2]]$words[[7]]
$segments[[2]]$words[[7]]$word
[1] " your"
$segments[[2]]$words[[7]]$start
[1] 9.78
$segments[[2]]$words[[7]]$end
[1] 9.94
$segments[[2]]$words[[7]]$probability
[1] 0.9963707
$segments[[2]]$words[[8]]
$segments[[2]]$words[[8]]$word
[1] " country."
$segments[[2]]$words[[8]]$start
[1] 9.94
$segments[[2]]$words[[8]]$end
[1] 10.36
$segments[[2]]$words[[8]]$probability
[1] 0.9990544
$language
[1] "en"
[[1]]
[[1]]$id
[1] 0
[[1]]$seek
[1] 0
[[1]]$start
[1] 0
[[1]]$end
[1] 7.42
[[1]]$text
[1] " And so my fellow Americans ask not what your country can do for you,"
[[1]]$tokens
[1] 50364 400 370 452 7177 6280 1029 406 437
[10] 428 1941 393 360 337 291 11 50744
[[1]]$temperature
[1] 0
[[1]]$avg_logprob
[1] -0.3966229
[[1]]$compression_ratio
[1] 1.341772
[[1]]$no_speech_prob
[1] 0.09206953
[[1]]$words
[[1]]$words[[1]]
[[1]]$words[[1]]$word
[1] " And"
[[1]]$words[[1]]$start
[1] 0
[[1]]$words[[1]]$end
[1] 0.52
[[1]]$words[[1]]$probability
[1] 0.6432374
[[1]]$words[[2]]
[[1]]$words[[2]]$word
[1] " so"
[[1]]$words[[2]]$start
[1] 0.52
[[1]]$words[[2]]$end
[1] 0.84
[[1]]$words[[2]]$probability
[1] 0.9871083
[[1]]$words[[3]]
[[1]]$words[[3]]$word
[1] " my"
[[1]]$words[[3]]$start
[1] 0.84
[[1]]$words[[3]]$end
[1] 1.18
[[1]]$words[[3]]$probability
[1] 0.7963706
[[1]]$words[[4]]
[[1]]$words[[4]]$word
[1] " fellow"
[[1]]$words[[4]]$start
[1] 1.18
[[1]]$words[[4]]$end
[1] 1.56
[[1]]$words[[4]]$probability
[1] 0.9954759
[[1]]$words[[5]]
[[1]]$words[[5]]$word
[1] " Americans"
[[1]]$words[[5]]$start
[1] 1.56
[[1]]$words[[5]]$end
[1] 2.1
[[1]]$words[[5]]$probability
[1] 0.9258457
[[1]]$words[[6]]
[[1]]$words[[6]]$word
[1] " ask"
[[1]]$words[[6]]$start
[1] 2.1
[[1]]$words[[6]]$end
[1] 3.72
[[1]]$words[[6]]$probability
[1] 0.2835915
[[1]]$words[[7]]
[[1]]$words[[7]]$word
[1] " not"
[[1]]$words[[7]]$start
[1] 3.72
[[1]]$words[[7]]$end
[1] 4.24
[[1]]$words[[7]]$probability
[1] 0.6561064
[[1]]$words[[8]]
[[1]]$words[[8]]$word
[1] " what"
[[1]]$words[[8]]$start
[1] 4.24
[[1]]$words[[8]]$end
[1] 5.52
[[1]]$words[[8]]$probability
[1] 0.7547233
[[1]]$words[[9]]
[[1]]$words[[9]]$word
[1] " your"
[[1]]$words[[9]]$start
[1] 5.52
[[1]]$words[[9]]$end
[1] 5.76
[[1]]$words[[9]]$probability
[1] 0.9823318
[[1]]$words[[10]]
[[1]]$words[[10]]$word
[1] " country"
[[1]]$words[[10]]$start
[1] 5.76
[[1]]$words[[10]]$end
[1] 6.24
[[1]]$words[[10]]$probability
[1] 0.9989085
[[1]]$words[[11]]
[[1]]$words[[11]]$word
[1] " can"
[[1]]$words[[11]]$start
[1] 6.24
[[1]]$words[[11]]$end
[1] 6.62
[[1]]$words[[11]]$probability
[1] 0.9966052
[[1]]$words[[12]]
[[1]]$words[[12]]$word
[1] " do"
[[1]]$words[[12]]$start
[1] 6.62
[[1]]$words[[12]]$end
[1] 6.82
[[1]]$words[[12]]$probability
[1] 0.9948519
[[1]]$words[[13]]
[[1]]$words[[13]]$word
[1] " for"
[[1]]$words[[13]]$start
[1] 6.82
[[1]]$words[[13]]$end
[1] 7.08
[[1]]$words[[13]]$probability
[1] 0.9948372
[[1]]$words[[14]]
[[1]]$words[[14]]$word
[1] " you,"
[[1]]$words[[14]]$start
[1] 7.08
[[1]]$words[[14]]$end
[1] 7.42
[[1]]$words[[14]]$probability
[1] 0.9964211
[[1]]
[[1]]$id
[1] 1
[[1]]$seek
[1] 0
[[1]]$start
[1] 7.42
[[1]]$end
[1] 10.36
[[1]]$text
[1] " ask what you can do for your country."
[[1]]$tokens
[1] 50744 1029 437 291 393 360 337 428 1941
[10] 13 50894
[[1]]$temperature
[1] 0
[[1]]$avg_logprob
[1] -0.3966229
[[1]]$compression_ratio
[1] 1.341772
[[1]]$no_speech_prob
[1] 0.09206953
[[1]]$words
[[1]]$words[[1]]
[[1]]$words[[1]]$word
[1] " ask"
[[1]]$words[[1]]$start
[1] 7.42
[[1]]$words[[1]]$end
[1] 8.46
[[1]]$words[[1]]$probability
[1] 0.9024832
[[1]]$words[[2]]
[[1]]$words[[2]]$word
[1] " what"
[[1]]$words[[2]]$start
[1] 8.46
[[1]]$words[[2]]$end
[1] 8.76
[[1]]$words[[2]]$probability
[1] 0.9623243
[[1]]$words[[3]]
[[1]]$words[[3]]$word
[1] " you"
[[1]]$words[[3]]$start
[1] 8.76
[[1]]$words[[3]]$end
[1] 9.04
[[1]]$words[[3]]$probability
[1] 0.9864706
[[1]]$words[[4]]
[[1]]$words[[4]]$word
[1] " can"
[[1]]$words[[4]]$start
[1] 9.04
[[1]]$words[[4]]$end
[1] 9.32
[[1]]$words[[4]]$probability
[1] 0.9966297
[[1]]$words[[5]]
[[1]]$words[[5]]$word
[1] " do"
[[1]]$words[[5]]$start
[1] 9.32
[[1]]$words[[5]]$end
[1] 9.56
[[1]]$words[[5]]$probability
[1] 0.9923227
[[1]]$words[[6]]
[[1]]$words[[6]]$word
[1] " for"
[[1]]$words[[6]]$start
[1] 9.56
[[1]]$words[[6]]$end
[1] 9.78
[[1]]$words[[6]]$probability
[1] 0.9964545
[[1]]$words[[7]]
[[1]]$words[[7]]$word
[1] " your"
[[1]]$words[[7]]$start
[1] 9.78
[[1]]$words[[7]]$end
[1] 9.94
[[1]]$words[[7]]$probability
[1] 0.9963707
[[1]]$words[[8]]
[[1]]$words[[8]]$word
[1] " country."
[[1]]$words[[8]]$start
[1] 9.94
[[1]]$words[[8]]$end
[1] 10.36
[[1]]$words[[8]]$probability
[1] 0.9990544
# Reshape the word-level output of each transcribed segment from one wide row
# (word / start / end / probability repeated once per word) into a long table
# with one row per word.

# ---- Segment 1 ----
# Columns 11 to 66 hold the per-word fields: 56 columns = 14 words x 4 fields,
# matching the 14 words printed for the first segment above.
textDF1Words <- textDF1[, c(11:66)]

# Strip the "words." prefix and give the first word an explicit ".0" suffix so
# that every column follows the same <field>.<index> naming scheme.
textDF1Words <- textDF1Words %>%
  rename_with(~ str_remove(., "words.")) %>%
  rename(
    word.0 = word,
    start.0 = start,
    end.0 = end,
    probability.0 = probability
  )

# Zero-pad the index to two digits ("0" -> "00", "1" -> "01", ...) so the
# names_pattern below can always split off exactly the last two characters.
colnames(textDF1Words) <- str_replace(
  colnames(textDF1Words),
  "\\d+",
  function(x) sprintf("%02d", as.integer(x))
)
textDF1Words

# Pivot to long format. The pattern leaves a trailing "." on the value columns
# ("word.", "start.", ...), which the final rename() removes.
textDF1Words <- textDF1Words %>%
  pivot_longer(
    cols = starts_with(c("word", "start", "end", "probability")),
    names_to = c(".value", "limit"),
    names_pattern = "(.*)(..)$"
  ) %>%
  rename(
    word = word.,
    start = start.,
    end = end.,
    probability = probability.
  )
textDF1Words

# ---- Segment 2 ----
# Same procedure; columns 11 to 42 = 8 words x 4 fields.
textDF2Words <- textDF2[, c(11:42)]
textDF2Words <- textDF2Words %>%
  rename_with(~ str_remove(., "words.")) %>%
  rename(
    word.0 = word,
    start.0 = start,
    end.0 = end,
    probability.0 = probability
  )
colnames(textDF2Words) <- str_replace(
  colnames(textDF2Words),
  "\\d+",
  function(x) sprintf("%02d", as.integer(x))
)
textDF2Words

textDF2Words <- textDF2Words %>%
  pivot_longer(
    cols = starts_with(c("word", "start", "end", "probability")),
    names_to = c(".value", "limit"),
    names_pattern = "(.*)(..)$"
  ) %>%
  rename(
    word = word.,
    start = start.,
    end = end.,
    probability = probability.
  )
textDF2Words

# Start an interactive Python session; the code that follows is Python.
reticulate::repl_python()
# Convert the word-level transcription in words_whisper.csv into a Praat
# TextGrid with a single interval tier named "word".
import csv
import textgrid  # install with: pip install textgrid

# Load the CSV data. newline="" lets the csv module handle line endings
# itself, as recommended in the csv documentation.
with open("words_whisper.csv", "r", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f, delimiter=",")
    data = list(reader)

# Create a TextGrid object
tg = textgrid.TextGrid()

# Create IntervalTier objects
transcript_tier = textgrid.IntervalTier(name="word")

# Populate the interval tier: one interval per CSV row, using the "start",
# "end" and "word" columns.
for row in data:
    start_time = float(row["start"])
    end_time = float(row["end"])
    transcript_tier.add(start_time, end_time, row["word"])

# Add the interval tier to the TextGrid
tg.append(transcript_tier)

# Write the TextGrid to a file
with open("words_whisper.TextGrid", "w", encoding="utf-8") as f:
    tg.write(f)