9 Prediction
INCOMPLETE DRAFT
…
The essential questions for this chapter are:
- …
- …
- …
In this chapter we will explore prediction through supervised text classification.
We begin with an orientation to the question(s) and dataset(s) to be explored …
# Read the curated learner (WRICLE) and native (LOCNESS) essay datasets.
# NOTE(review): the paths say "10-prediction" but the chapter heading says 9 —
# confirm the resource directory matches the chapter number.
wricle_df <- read_csv(file = "resources/10-prediction/data/derived/wricle_formal_curated.csv")
locness_df <- read_csv(file = "resources/10-prediction/data/derived/locness_curated.csv")
# Prepare the learner subset: Spanish-L1 writers from WRICLE, labeled and
# reduced to the four columns used throughout the chapter.
learners_df <- wricle_df %>%
  filter(native_language == "Spanish") %>%      # keep Spanish-L1 writers only
  mutate(student = "Spanish",                   # group labels added in one call
         type = "Learner") %>%
  select(essay_id = id, type, student, essay)   # keep and rename key columns

glimpse(learners_df)                            # preview structure
#> Rows: 689
#> Columns: 4
#> $ essay_id <chr> "A1-1", "A1-2", "A10-1", "A10-2", "A101-1", "A101-2", "A101-3…
#> $ type <chr> "Learner", "Learner", "Learner", "Learner", "Learner", "Learn…
#> $ student <chr> "Spanish", "Spanish", "Spanish", "Spanish", "Spanish", "Spani…
#> $ essay <chr> "In our present society gay people is asking for the same rig…
# Prepare the native-speaker subset: LOCNESS essays with a combined document
# id, labeled and reduced to the same four columns as learners_df.
natives_df <- locness_df %>%
  unite(col = doc_id, c("file_id", "essay_id"), sep = "-") %>%  # file_id-essay_id
  mutate(doc_id = str_replace(doc_id, "\\.txt", ""),            # drop .txt suffix
         type = "Native") %>%                                   # group label
  select(essay_id = doc_id, type, student, essay)

glimpse(natives_df)                                             # preview structure
#> Rows: 411
#> Columns: 4
#> $ essay_id <chr> "alevels1-1", "alevels1-2", "alevels1-3", "alevels1-4", "alev…
#> $ type <chr> "Native", "Native", "Native", "Native", "Native", "Native", "…
#> $ student <chr> "British", "British", "British", "British", "British", "Briti…
#> $ essay <chr> "The basic dilema facing the UK's rail and road transport sys…
# Combine the two groups into one dataset. bind_rows() is preferred over
# rbind() for tibbles: it matches columns by name rather than position, so a
# reordered column in either input cannot silently misalign the data.
nativeness_df <- bind_rows(learners_df, natives_df) # combine

# Sanity check: total word counts per group (tokenized with tidytext).
nativeness_df %>%
  unnest_tokens(output = "word", input = "essay") %>%
  count(word, type) %>%          # word counts within each group
  group_by(type) %>%
  summarize(total_words = sum(n)) # collapse to one total per group
#> # A tibble: 2 × 2
#> type total_words
#> <chr> <int>
#> 1 Learner 640535
#> 2 Native 324269
# Build a quanteda corpus: `essay` becomes the document text and the other
# columns (essay_id, type, student) are stored as document variables (docvars).
nativeness_corpus <- nativeness_df %>%
corpus(text_field = "essay")
# Per-document summary (types, tokens, sentences plus docvars).
# `n = ndoc(...)` overrides summary()'s default cap so ALL documents are
# summarized, which is why the header below reports "showing 1100 documents"
# even though slice_head() then previews only the first 10 rows.
nativeness_corpus_summary <- nativeness_corpus %>%
summary(n = ndoc(nativeness_corpus))
nativeness_corpus_summary %>%
slice_head(n = 10) # preview the first 10 documents
#> Corpus consisting of 1100 documents, showing 1100 documents:
#>
#> Text Types Tokens Sentences essay_id type student
#> text1 210 496 26 A1-1 Learner Spanish
#> text2 253 655 29 A1-2 Learner Spanish
#> text3 280 722 30 A10-1 Learner Spanish
#> text4 177 389 15 A10-2 Learner Spanish
#> text5 245 636 19 A101-1 Learner Spanish
#> text6 219 692 17 A101-2 Learner Spanish
#> text7 213 615 16 A101-3 Learner Spanish
#> text8 319 914 23 A101-4 Learner Spanish
#> text9 253 654 19 A102-1 Learner Spanish
#> text10 310 816 21 A102-2 Learner Spanish
# Add a numeric doc_id docvar (1..ndoc) so documents can be matched back to
# rows later (e.g. when splitting into train/test sets). seq_len() is the
# safe-sequence idiom: unlike 1:n it degrades gracefully if n were ever 0.
nativeness_corpus$doc_id <- seq_len(ndoc(nativeness_corpus))

nativeness_corpus %>%
  docvars() %>%        # inspect the document-level variables
  slice_head(n = 5)    # preview the first 5 rows
#> essay_id type student doc_id
#> 1 A1-1 Learner Spanish 1
#> 2 A1-2 Learner Spanish 2
#> 3 A10-1 Learner Spanish 3
#> 4 A10-2 Learner Spanish 4
#> 5 A101-1 Learner Spanish 5
# Tokenize into words, stripping punctuation, numbers, and symbols.
# Note: tokens() does not lowercase, which is why "In"/"The" keep their
# original case in the preview below (lowercasing happens later in dfm()).
nativeness_tokens <- nativeness_corpus %>%
tokens(what = "word", remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
nativeness_tokens %>%
tokens_group(groups = type) %>% # collapse into one token stream per group
head() # preview the grouped tokens
#> Tokens consisting of 2 documents and 1 docvar.
#> Learner :
#> [1] "In" "our" "present" "society" "gay" "people" "is"
#> [8] "asking" "for" "the" "same" "rights"
#> [ ... and 636,407 more ]
#>
#> Native :
#> [1] "The" "basic" "dilema" "facing" "the" "UK's"
#> [7] "rail" "and" "road" "transport" "system" "is"
#> [ ... and 321,936 more ]
# Convert the tokens into a document-feature matrix. dfm() lowercases
# features by default, so e.g. "In" and "in" are counted together from here.
nativeness_dfm <- dfm(nativeness_tokens)

# Preview feature counts for the first five documents.
head(nativeness_dfm, n = 5)
#> Document-feature matrix of: 5 documents, 28,129 features (99.24% sparse) and 4 docvars.
#> features
#> docs in our present society gay people is asking for the
#> text1 12 5 4 5 5 10 12 1 3 23
#> text2 26 2 1 4 0 10 14 0 1 45
#> text3 12 4 0 4 9 16 12 0 0 12
#> text4 15 1 0 1 0 9 7 0 1 10
#> text5 15 1 1 2 0 7 17 0 6 40
#> [ reached max_nfeat ... 28,119 more features ]
# Collapse the matrix to one row per group (Learner vs Native) and preview
# the combined feature counts.
head(dfm_group(nativeness_dfm, groups = type), n = 5)
#> Document-feature matrix of: 2 documents, 28,129 features (33.74% sparse) and 1 docvar.
#> features
#> docs in our present society gay people is asking for the
#> Learner 15518 1279 322 1269 520 5980 12931 33 4993 36728
#> Native 6357 584 109 422 6 1569 6307 12 3144 21090
#> [ reached max_nfeat ... 28,119 more features ]
# The 25 most frequent features across the whole corpus — dominated by
# function words, as expected for raw counts.
topfeatures(nativeness_dfm, n = 25)
#> the of to and in that a is it this
#> 57818 32821 31155 24158 21875 20847 19981 19238 11259 10357
#> be are they not for as people have with or
#> 10096 10085 8438 8196 8137 7977 7549 7506 6007 5568
#> their i on by because
#> 5192 5128 4820 4579 4511
# Top five raw-frequency features within each group; docfreq shows how many
# documents in the group contain the feature.
textstat_frequency(nativeness_dfm, n = 5, groups = type)
#> feature frequency rank docfreq group
#> 1 the 36728 1 689 Learner
#> 2 of 22095 2 689 Learner
#> 3 to 20417 3 689 Learner
#> 4 that 15929 4 689 Learner
#> 5 and 15836 5 689 Learner
#> 6 the 21090 1 410 Native
#> 7 to 10738 2 410 Native
#> 8 of 10726 3 410 Native
#> 9 and 8322 4 409 Native
#> 10 a 6827 5 410 Native
# Weight counts by tf-idf to surface group-distinctive content words;
# force = TRUE is required because tf-idf scores are not raw counts.
textstat_frequency(dfm_tfidf(nativeness_dfm), n = 5, groups = type, force = TRUE)
#> feature frequency rank docfreq group
#> 1 abortion 1108 1 69 Learner
#> 2 marijuana 901 2 46 Learner
#> 3 education 746 3 204 Learner
#> 4 children 735 4 317 Learner
#> 5 sex 658 5 133 Learner
#> 6 he 803 1 199 Native
#> 7 his 573 2 194 Native
#> 8 candide 522 3 27 Native
#> 9 quote 402 4 26 Native
#> 10 caligula 394 5 11 Native
set.seed(4321) # make reproducible

# Total number of documents and a 75% training-set size.
num_docs <- ndoc(nativeness_dfm)
train_size <- round(num_docs * 0.75) # round to nearest whole number

# Draw the training document ids. seq_len() replaces the 1:num_docs idiom
# (safe if num_docs were 0); the draw is identical to sample(1:num_docs, ...)
# under the same seed, so the split is unchanged.
train_ids <- sample(x = seq_len(num_docs), # population
                    size = train_size,     # size of sample
                    replace = FALSE)       # without replacement

# Partition the dfm on the doc_id docvar: sampled ids train, the rest test.
nativeness_dfm_train <- nativeness_dfm %>%
  dfm_subset(doc_id %in% train_ids)
nativeness_dfm_test <- nativeness_dfm %>%
  dfm_subset(!doc_id %in% train_ids)
# Verify the class balance is preserved by the random split: the full set,
# the training set, and the test set should all show roughly the same
# Learner/Native proportions (~63%/37%).
nativeness_dfm %>%
docvars() %>%
janitor::tabyl(type) # class distribution, full dataset
#> type n percent
#> Learner 689 0.626
#> Native 411 0.374
nativeness_dfm_train %>%
docvars() %>%
janitor::tabyl(type) # class distribution, training set
#> type n percent
#> Learner 516 0.625
#> Native 309 0.375
nativeness_dfm_test %>%
docvars() %>%
janitor::tabyl(type) # class distribution, test set
#> type n percent
#> Learner 173 0.629
#> Native 102 0.371
# Fit a Naive Bayes classifier: x = training document-feature matrix,
# y = the Learner/Native outcome labels for those documents.
nb1 <- textmodel_nb(x = nativeness_dfm_train, y = nativeness_dfm_train$type)
# The summary shows uniform class priors (0.5/0.5 in the output below —
# presumably the function's default; confirm against the quanteda docs)
# and per-class estimated feature probabilities.
summary(nb1)
#>
#> Call:
#> textmodel_nb.dfm(x = nativeness_dfm_train, y = nativeness_dfm_train$type)
#>
#> Class Priors:
#> (showing first 2 elements)
#> Learner Native
#> 0.5 0.5
#>
#> Estimated Feature Scores:
#> in our present society gay people is asking
#> Learner 0.0230 0.00186 0.000476 0.00193 7.62e-04 0.00889 0.0193 5.76e-05
#> Native 0.0174 0.00163 0.000315 0.00114 2.22e-05 0.00460 0.0176 2.96e-05
#> for the same rights that heterosexual some of
#> Learner 0.00747 0.0544 0.001318 0.000909 0.0238 3.06e-04 0.00476 0.0328
#> Native 0.00886 0.0582 0.000563 0.000252 0.0140 1.11e-05 0.00157 0.0298
#> this are to be allowed get married and
#> Learner 0.01148 0.01114 0.0300 0.01023 0.000326 0.001054 3.67e-04 0.0233
#> Native 0.00796 0.00745 0.0296 0.00932 0.000278 0.000763 8.52e-05 0.0224
#> adopt children request has been accepted
#> Learner 2.64e-04 0.00268 1.39e-05 0.00414 0.00187 0.000137
#> Native 3.71e-05 0.00119 2.22e-05 0.00435 0.00213 0.000141
# Per-class feature probability estimates for the first few features.
head(coef(nb1))
#> Learner Native
#> in 0.023045 1.74e-02
#> our 0.001856 1.63e-03
#> present 0.000476 3.15e-04
#> society 0.001929 1.14e-03
#> gay 0.000762 2.22e-05
#> people 0.008893 4.60e-03
# Predicted class probabilities on the training data (no newdata supplied);
# preview the last few rows.
tail(predict(nb1, type = "prob"))
#> Learner Native
#> text1092 2.37e-164 1
#> text1094 3.18e-28 1
#> text1095 8.53e-49 1
#> text1096 1.83e-37 1
#> text1097 7.45e-39 1
#> text1098 2.35e-55 1
# Reshape the training-set probability matrix into one row per document with
# that document's single most probable class.
nb1_predictions <-
predict(nb1, type = "prob") %>% # get the predicted document scores
as.data.frame() %>% # convert to data frame
mutate(document = rownames(.)) %>% # add the document names to the data frame
as_tibble() %>% # convert to tibble
pivot_longer(cols = c("Learner", "Native"), # convert from wide to long format
names_to = "prediction", # new column for Learner/Native predictions
values_to = "probability") %>% # probability scores for each
group_by(document) %>% # group parameter by document
slice_max(probability, n = 1) %>% # keep the document row with highest probability
slice_head(n = 1) %>% # break exact 50/50 ties by keeping the first row
ungroup() %>% # remove grouping parameter
mutate(doc_id = str_remove(document, "text") %>% as.numeric) %>% # clean up document column so it matches the numeric doc_id docvar
arrange(doc_id) # order by doc_id
nb1_predictions %>%
slice_head(n = 10) # preview
#> # A tibble: 10 × 4
#> document prediction probability doc_id
#> <chr> <chr> <dbl> <dbl>
#> 1 text1 Learner 1 1
#> 2 text3 Learner 1 3
#> 3 text4 Learner 1 4
#> 4 text5 Learner 1 5
#> 5 text6 Learner 1 6
#> 6 text8 Learner 1 8
#> 7 text9 Learner 1 9
#> 8 text11 Learner 1 11
#> 9 text12 Learner 1 12
#> 10 text13 Learner 1 13
# Attach the actual training labels to the predictions.
# NOTE(review): cbind() aligns rows by position only — this assumes nb1$y
# (training-dfm order) and nb1_predictions (arranged by ascending doc_id)
# share the same row order, which holds only if dfm_subset() kept documents
# in ascending doc_id order; confirm before reusing this pattern.
nb1_predictions_actual <-
cbind(actual = nb1$y, nb1_predictions) %>% # column-bind actual classes
select(doc_id, document, actual, prediction, probability) # organize variables
nb1_predictions_actual %>%
slice_head(n = 5) # preview
#> doc_id document actual prediction probability
#> 1 1 text1 Learner Learner 1
#> 2 3 text3 Learner Learner 1
#> 3 4 text4 Learner Learner 1
#> 4 5 text5 Learner Learner 1
#> 5 6 text6 Learner Learner 1
# Cross-tabulate actual vs predicted labels for the TRAINING set; this is a
# fit diagnostic, not an estimate of generalization (see the test-set
# evaluation below for that).
tab_class <-
table(nb1_predictions_actual$actual, # actual class labels
nb1_predictions_actual$prediction) # predicted class labels
caret::confusionMatrix(tab_class, mode = "prec_recall") # model performance statistics
#> Confusion Matrix and Statistics
#>
#>
#> Learner Native
#> Learner 516 0
#> Native 10 299
#>
#> Accuracy : 0.988
#> 95% CI : (0.978, 0.994)
#> No Information Rate : 0.638
#> P-Value [Acc > NIR] : < 2e-16
#>
#> Kappa : 0.974
#>
#> Mcnemar's Test P-Value : 0.00443
#>
#> Precision : 1.000
#> Recall : 0.981
#> F1 : 0.990
#> Prevalence : 0.638
#> Detection Rate : 0.625
#> Detection Prevalence : 0.625
#> Balanced Accuracy : 0.990
#>
#> 'Positive' Class : Learner
#>
# Evaluate on the held-out TEST set: predict class labels for unseen
# documents and compare against the true labels. The small drop from the
# training-set accuracy (0.988 -> 0.964) suggests limited overfitting.
predicted_class <- predict(nb1, newdata = nativeness_dfm_test)
actual_class <- nativeness_dfm_test$type
tab_class <- table(actual_class, predicted_class) # cross-tabulate actual and predicted class labels
caret::confusionMatrix(tab_class, mode = "prec_recall") # model performance statistics
#> Confusion Matrix and Statistics
#>
#> predicted_class
#> actual_class Learner Native
#> Learner 172 1
#> Native 9 93
#>
#> Accuracy : 0.964
#> 95% CI : (0.934, 0.982)
#> No Information Rate : 0.658
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.921
#>
#> Mcnemar's Test P-Value : 0.0269
#>
#> Precision : 0.994
#> Recall : 0.950
#> F1 : 0.972
#> Prevalence : 0.658
#> Detection Rate : 0.625
#> Detection Prevalence : 0.629
#> Balanced Accuracy : 0.970
#>
#> 'Positive' Class : Learner
#>
9.2 Model training
The aim is to create an abstraction of the patterns in the dataset.
Feature engineering
Model selection
Model evaluation
In this subsection, let's consider the results from a hypothetical model of text classification on the SMS dataset introduced earlier (see Figure ()).
- accuracy (measure of overall correct predictions)
- precision (measure of the quality of the predictions)
- Percentage of predicted ‘ham’ messages that were correct
- recall (measure of the quantity of the predictions)
- Percentage of actual ‘ham’ messages that were correct
- F1-score (summarizes the balance between precision and recall)
Avoiding overfitting
9.4 Evaluation
Evaluation of results
Relationship between predicted and actual classes in a confusion matrix as seen in Figure 9.1.