forked from amunategui/Chatbot-Conversations
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathIntentModeling_post_R_version.Rmd
189 lines (154 loc) · 5.44 KB
/
IntentModeling_post_R_version.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
---
title: "Chatbot_Parallelize_spacyr"
author: "Jasen Mackie"
date: "02/01/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
Initialize spacyr.
```{r initialize spacy}
library(spacyr)
# spacy_install() # run this once, to create a new conda environment "spacy_condaenv"
spacy_initialize()
```
Read data.
```{r read data}
library(data.table)
df <- fread("complaints.csv", select = c("Product", "Consumer complaint narrative", "Sub-issue"))
df <- df[df$`Consumer complaint narrative` != "",]
# df <- fread("shuf -n 200000 complaints.csv")
set.seed(123)
df <- df[sample(1:nrow(df), 200000)]
```
Parse with spacyr. I use 8 cores and foreach on an Ubuntu machine.
```{r spacy parse}
t1 <- Sys.time()
library(doParallel)
library(parallel)
registerDoParallel(8)
library(foreach)
require(magrittr)
require(dplyr)
t1 <- Sys.time()
x <- foreach(i = 1:nrow(df)) %dopar% {
spacy_parse(df$`Consumer complaint narrative`[i],
lemma = FALSE,
entity = TRUE,
nounphrase = TRUE) %>%
mutate(complaint_id = df$`Complaint ID`[i])
}
t2 <- Sys.time()
t2 - t1
```
Some observations (22 our of 200k) do not have noun phrases, so we remove those. The reason is the message is too short.
```{r remove incomplete observations}
length(x)
col_count <- do.call("rbind", lapply(x, ncol))
head(col_count)
length(which(col_count != 8))
```
After removing the incomplete messages we rbind the list elements into a dataframe.
```{r convert_process}
x <- x[-which(col_count != 8)]
parsed_text <- do.call("rbind", x)
```
We start counting verbs, nouns and noun phrases.
```{r verb_noun_count}
library(magrittr)
library(dplyr)
verb_count <- parsed_text %>% filter(pos=="VERB") %>% count(token, sort=TRUE)
t(head(verb_count, 10))
nrow(verb_count)
noun_count <- parsed_text %>% filter(pos=="NOUN") %>% count(token, sort=TRUE)
t(head(noun_count, 10))
nrow(noun_count)
t1 <- Sys.time()
noun_phrases <- nounphrase_consolidate(parsed_text)
t2 <- Sys.time()
t2-t1
```
```{r ok_real_count_this_time}
incl_verbs <- verb_count[verb_count$n >= 10000,]
head(incl_verbs)
nrow(incl_verbs)
incl_nouns <- noun_count[noun_count$n >= 10000,]
head(incl_nouns)
nrow(incl_nouns)
nounphrase_count <- noun_phrases %>% filter(pos=="nounphrase") %>% count(token, sort=TRUE)
t(head(nounphrase_count, 100))
nrow(nounphrase_count)
incl_noun_phrases <- nounphrase_count[nounphrase_count$n >= 10000,]
head(incl_noun_phrases)
nrow(incl_noun_phrases)
```
Build a one-hot encoded matrix of the chosen verbs, nouns and noun phrases.
```{r binary_lines}
binary_lines <- matrix(0, nrow = nrow(df), ncol = (nrow(incl_nouns)+nrow(incl_verbs)+nrow(incl_noun_phrases)))
# binary_lines <- matrix(0, nrow = nrow(new_df), ncol = (nrow(incl_nouns)+nrow(incl_verbs)))
dim(binary_lines)
```
```{r one-hot encode}
# Replace spaces in between noun phrases with undercores so we can use them as single tokens
incl_noun_phrases$unjoined_nounphrase <- gsub("_", " ", incl_noun_phrases$token)
# incl_noun_phrases$unjoined_nounphrase <- str_replace_all(as.character(incl_noun_phrases$token), setNames("_", " "))
head(incl_noun_phrases, 20)
head(as.character(incl_noun_phrases$unjoined_nounphrase))
# TODO: convert to foreach
library(stringr)
t1 <- Sys.time()
for(i in 1:length(df$`Consumer complaint narrative`)){
if(i %% 10000 == 0) {
print(i)
}
binary_lines[i,] <- str_detect(df$`Consumer complaint narrative`[i] , c(as.character(incl_noun_phrases$unjoined_nounphrase), as.character(incl_nouns$token), as.character(incl_verbs$token)))
# binary_lines[i,] <- str_detect(new_df$clean_text[i] , c(as.character(incl_nouns$lemma), as.character(incl_verbs$lemma))) # use if we exclude noun phrases
}
t2 <- Sys.time()
t2 - t1
colnames(binary_lines) <- c(as.character(incl_noun_phrases$token), as.character(incl_nouns$token), as.character(incl_verbs$token))
head(binary_lines,10)
```
We need to convert the noun phrases in the original text to the underscore joined version so we can correctly conduct an n-gram analysis using the tokenized noun phrases.
```{r convert_original_text_to_joined_nounphrase}
t1 <- Sys.time()
for(i in 1:length(df$`Consumer complaint narrative`)) {
# for(i in 1:2) {
if(i %% 10000 == 0) {
print(i)
}
df$`Consumer complaint narrative`[i] <- str_replace_all(df$`Consumer complaint narrative`[i], setNames(c(as.character(incl_noun_phrases$token)), c(as.character(incl_noun_phrases$unjoined_nounphrase))))
}
t2 <- Sys.time()
t2 - t1
```
Do some kmeans clustering.
```{r kmeans}
kmeans <- kmeans(binary_lines, centers = 50)
```
Inspect the number of observations per cluster, and asign a cluster identifier to each observation in the original dataframe.
```{r inspect kmeans}
table(kmeans$cluster)
sum(table(kmeans$cluster))
head(kmeans$cluster)
df$cluster <- kmeans$cluster
df$`Consumer complaint narrative`[head(kmeans$cluster)]
```
Conduct n-gram analysis on cluster 15.
```{r n-grams}
# Check n-grams for nth cluster
library(tidytext)
chat_ngrams <- df[which(df$cluster == 15),] %>%
unnest_tokens(ngram, `Consumer complaint narrative`, token = "ngrams", n = 4)
head(chat_ngrams$ngram, 20)
# head(new_lines_tidytext_df[which(new_lines_tidytext_df$cluster == 38),])
nrow(chat_ngrams)
chat_ngrams %>%
count(ngram, sort = TRUE)
```
Tie back the chosen keywords to the actual complaints for further investigation.
```{r keywords}
keyword_bool <- str_detect(df$`Consumer complaint narrative`, "a victim of identity_theft")
keyword_df <- df$`Consumer complaint narrative`[keyword_bool]
```