HMM R
HMM R
Here is an example of a hidden markov model in R for part-of-speech (POS) tagging from Jurafsky & Martin’s Speech & Language Processing book:
library(pacman)
library(tidyverse)
library(magrittr)
p_load(HMM, TraMineR, seqHMM, data.table, psych, progress, glue, scales, Hmisc)
transitions <- matrix(
c(
0.3777, 8e-04, 0.0322, 0.0366, 0.0096, 0.0068, 0.1147,
0.011, 2e-04, 5e-04, 4e-04, 0.0176, 0.0102, 0.0021,
9e-04, 0.7968, 0.005, 1e-04, 0.0014, 0.1011, 2e-04,
0.0084, 5e-04, 0.0837, 0.0733, 0.0086, 0.1012, 0.2157,
0.0584, 8e-04, 0.0615, 0.4509, 0.1216, 0.012, 0.4744,
0.009, 0.1698, 0.0514, 0.0036, 0.0177, 0.0728, 0.0102,
0.0025, 0.0041, 0.2231, 0.0036, 0.0068, 0.0479, 0.0017
),
nrow=7, # from state
ncol=7, # to state
byrow = TRUE
) %>%
t()
transition_labels <- c("NNP", "MD", "VB", "JJ", "NN", "RB", "DT")
emissions <- matrix(
c(
0.000032,0,0,0.000048,0,
0,0.308431,0,0,0,
0,0.000028,0.000672,0,0.000028,
0,0,0.00034,0,0,
0,0.0002,0.000223,0,0.002337,
0,0,0.010446,0,0,
0,0,0,0.506099,0
),
nrow=7, # number of observations
ncol=5, # number of states
byrow = TRUE
)
emissions_labels <- c("Janet", "will", "back", "the", "bill") # Observation row names
Pi <- c(0.2767, 6e-04, 0.0031, 0.0453, 0.0449, 0.051, 0.2026) # States
# This function initialises a general discrete time and discrete space
# Hidden Markov Model (HMM). A HMM consists of an alphabet of states and
# emission symbols. A HMM assumes that the states are hidden from the observer,
# while only the emissions of the states are observable. The HMM is designed
# to make inference on the states through the observation of emissions.
# The stochastics of the HMM is fully described by the initial starting
# probabilities of the states, the transition probabilities between states
# and the emission probabilities of the states.
#
# States
# Vector with the names of the states.
# Symbols
# Vector with the names of the symbols.
# startProbs
# Vector with the starting probabilities of the states.
# transProbs
# Stochastic matrix containing the transition probabilities between the states.
# emissionProbs
# Stochastic matrix containing the emission probabilities of the states
# These are our observations:
Symbols <- emissions_labels
# These are our hidden states:
States <- transition_labels
hmm <- initHMM(States, # Hidden States
Symbols, # Symbols, or observations
transProbs = transitions,
emissionProbs = emissions,
startProbs = Pi)
hmm$States
hmm$Symbols
hmm$startProbs
hmm$transProbs
hmm$emissionProbs
# Calculate Viterbi path
viterbi_hidden_states <- viterbi(hmm, c('Janet', 'will', 'back', 'the', 'bill'))
print(viterbi_hidden_states)
# Expected output:
c('NNP', 'MD', 'VB', 'DT', 'NN')
# Evaluate viterbi-predicted hidden state sequence against actual hidden states
x <- data.frame(
viterbi_hidden_states = viterbi_hidden_states,
actual_hidden_states = c('NNP', 'MD', 'VB', 'DT', 'NN')
)
# Calculate cohen's kappa statistic, which is an inter-rater agreement score between two sequences:
psych::cohen.kappa(x, w=NULL, n.obs=NULL, alpha=.05, levels=NULL)
# Simulate from the HMM
simHMM(hmm, 4)
Here is an example of POS tagging with hand-tagged data from Wall Street Journal articles:
Preprocessed WSJ sequence data looks like:
token tag
1: <START> <START>
2: most JJS
3: banking NN
4: issues NNS
5: <OOV> <OOV>
6: after IN
7: a DT
8: sector NN
9: downgrade NN
10: by IN
11: <OOV> <OOV>
12: securities NNP
13: , ,
14: although IN
15: national NNP
16: <OOV> <OOV>
17: showed VBD
18: strength NN
19: on IN
20: positive JJ
21: comments NNS
22: from IN
23: brokerage NN
24: firms NNS
25: about IN
26: its PRP$
27: long-term JJ
28: prospects NNS
29: . .
30: <EOS> <EOS>
Start by reading in preprocessed CSV files:
wsj_train <- fread('wsj_train.csv', col.names = c('token','tag'), header = FALSE, skip = 1)
wsj_test <- fread('wsj_test.csv', col.names = c('token','tag'), header = FALSE, skip = 1)
n_tokens <- wsj_train$token %>% n_distinct()
# Handle OOV words:
token_freq <- wsj_train %>%
group_by(token) %>%
tally() %>%
mutate(freq = n/sum(n)) %>%
arrange(-n)
# Using proportion of tokens:
vocab <- token_freq %>% head(round(0.9*n_tokens)) %>% select(token)
vocab_size <- length(vocab$token)
#vocab_size # 12,816
oov_tokens <- token_freq %>% tail(n_tokens-round(0.9*n_tokens)) %>% select(token)
#oov_tokens
#length(oov_tokens$token) # 1,424
# Replace rare observation tokens with out-of-vocabulary token:
wsj_train_closed <- wsj_train
wsj_train_closed[wsj_train_closed$token %in% oov_tokens$token] <- '<OOV>'
# Do the same for test:
wsj_test_closed <- wsj_test
wsj_test_closed[!wsj_test_closed$token %in% vocab$token] <- '<OOV>'
# First steps are to create transitions, emissions,
# and starting/initial probability matrices:
# Transitions probabilities (from one state to the next state):
transitions <- wsj_train_closed %>%
select(tag) %>%
mutate(next_tag = lead(tag)) %>%
table()
transitions_probs <- transitions / rowSums(transitions)
rm(transitions) # Garbage collection - counts no longer needed
# Emission probabilities (probability of state given observation):
emissions <- wsj_train_closed %>%
select(token, tag) %>%
na.omit() %>%
table()
emissions_probs <- emissions / rowSums(emissions)
rm(emissions) # Garbage collection - counts no longer needed
# Initial probabilities
Pi <- transitions_probs['<START>',]
# Do the probability matrices sum to 1 per column/row?
#rowSums(emissions_probs) # all 1's
#rowSums(transitions_probs) # all 1's
#sum(Pi) # 1
# Initialise HMM
#initHMM(States, Symbols, startProbs=NULL, transProbs=NULL, emissionProbs=NULL)
# These are our observations:
Symbols <- row.names(emissions_probs)
# These are our hidden states:
States <- colnames(transitions_probs)
# This is our parameterized hidden markov model:
hmm <- initHMM(States, # Hidden States
Symbols, # Symbols, or observations
transProbs = transitions_probs,
emissionProbs = emissions_probs %>% t(),
startProbs = Pi)
#print(hmm)
hmm$States
hmm$Symbols
hmm$startProbs
hmm$transProbs
row.names(hmm$transProbs)
hmm$emissionProbs
row.names(hmm$emissionProbs)
# Simulate from the HMM
simHMM(hmm, 10)
# Going through one sample sequence tagging procedure:
# Sequence of observations
observations <- wsj_test_closed[2:30,]$token
actual_hidden_states <- wsj_test_closed[2:30,]$tag
# Calculate Viterbi path
viterbi_hidden_states <- viterbi(hmm, observations)
print(viterbi_hidden_states)
# Evaluate viterbi-predicted hidden state sequence against actual hidden states
x <- data.frame(
viterbi_hidden_states = viterbi_hidden_states,
actual_hidden_states = actual_hidden_states
)
# Calculating cohen's kappa score for inter-rater agreement to assess performance of the model:
kappa <- psych::cohen.kappa(x, w=NULL, n.obs=NULL, alpha=.05, levels=NULL)
kappa$kappa
kappa$weighted.kappa
# Going through multiple test set sequences:
# First, split test set into sequences, removing the <START> and <END>
# tokens/tags:
test_sequences <- wsj_test_closed
test_sequences$start <- ifelse(test_sequences$token == '<START>', 1, 0)
test_sequences$seq_id <- cumsum(test_sequences$start)
test_sequences %<>%
filter(
token %nin% c('<START>', '<EOS>')
) %>%
select(-start)
# With the prepared test sequences, loop through them and add the kappa scores to a list:
options(show.error.messages = FALSE)
k <- 100
#k <- max(test_sequences$seq_id)
viterbi_hidden_states_list <- list(length = k)
kappas <- list(length = k)
successes <- 0
#pb <- progress_bar$new(total = k)
pb <- progress_bar$new(format = "[:bar] :current/:total (:percent)", total = k)
for(i in 1:k){
pb$tick()
try(
{
sequences <- test_sequences %>% filter(seq_id == i)
actual_observations <- sequences$token
actual_hidden_states <- sequences$tag
viterbi_hidden_states <- HMM::viterbi(hmm, actual_observations)
viterbi_hidden_states_list[[i]] <- viterbi_hidden_states
x <- data.frame(
viterbi_hidden_states = viterbi_hidden_states,
actual_hidden_states = actual_hidden_states
)
kappa <- psych::cohen.kappa(x, w=NULL, n.obs=NULL, alpha=.05, levels=NULL)
kappas[[i]] <- kappa$weighted.kappa
successes <- successes + 1
}
,silent = TRUE
)
}
options(show.error.messages = TRUE)
successes
# Compare actual hidden states to the model's predictions:
viterbi_hidden_states_list[[1]]
test_sequences %>%
filter(seq_id == 1) %>%
select(tag)
kappas
avg_kappa <- mean(unlist(kappas))
print(glue("Average kappa score for hidden markov model was: {percent(round(avg_kappa,4))}"))