-
Notifications
You must be signed in to change notification settings - Fork 0
/
Creole Linguistics Analysis(Trial).Rmd
375 lines (244 loc) · 9.52 KB
/
Creole Linguistics Analysis(Trial).Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
---
title: "Creole Linguistics Analysis"
author: "Tariq Williams"
date: "April 2023"
output:
  github_document:
    toc: true
    toc_depth: 2
---
```{r setup, include = FALSE}
# Setup chunk: sets the global knitr chunk defaults and attaches the packages
# used by this document.
# `include = FALSE` in this chunk's header keeps the chunk's code and its
# output (including package start-up messages) out of the rendered report.
# `echo = FALSE` below hides the source code of every chunk by default;
# change it to `echo = TRUE` to show the code in the report.
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(tm)
library(readxl)
```
# Read in data
```{r}
# Load the song tables and the word list used to classify lyrics.
# All paths are relative to the directory the document is knitted from.
Song_Scrape_Trial <- read_excel("Song Scrape Trial.xlsx")

# Word list as a one-column data frame (column "words"). Quote and comment
# handling are switched off so that entries containing an apostrophe or "#"
# are read literally; with read.table()'s defaults those characters start a
# quoted string / a comment and corrupt the surrounding rows.
english_words_patois_edit <- read.table(
  "english_words_patois_edit.txt",
  header = FALSE,
  col.names = "words",
  quote = "",
  comment.char = ""
)

# `lyrics_data` and `zongo` are both the contents of lyrics_3.xlsx, so the
# workbook is read once and the result is bound to both names.
lyrics_data <- read_excel("lyrics_3.xlsx")
zongo <- lyrics_data
```
# Data test
```{r}
# The English word list can be fetched once by uncommenting:
# download.file("https://raw.githubusercontent.com/dwyl/english-words/master/words.txt",
#               destfile = "english_words.txt", quiet = TRUE)

# Sample Patois text used for this trial.
text <- "Dem love chat, dem nuh run dem block Run dem block, 1st Class Outside wid half-dozen gunman ready fi do the action"

# Load the dictionary (one entry per line of the file).
english_words <- readLines("english_words.txt")

# Tokenise the sample on runs of non-word characters.
all_words <- unlist(strsplit(text, "\\W+"))

# A token counts as English when it appears verbatim (case-sensitively) in
# the dictionary.
is_english <- all_words %in% english_words
english_words_in_text <- all_words[is_english]

# Show the tokens that were recognised.
english_words_in_text
```
# Data test Trial 2
```{r}
# Second run of the trial: same sample text, same dictionary file.
text <- "Dem love chat, dem nuh run dem block Run dem block, 1st Class Outside wid half-dozen gunman ready fi do the action"

# Break the text into tokens wherever one or more non-word characters occur.
all_words <- unlist(strsplit(text, "\\W+"))

# Dictionary of English words, one entry per line of the file.
english_words <- readLines("english_words.txt")

# Keep the tokens that have an exact match in the dictionary. match() with
# nomatch = 0L is the definition of %in%, so the selection is unchanged.
english_words_in_text <- all_words[
  match(all_words, english_words, nomatch = 0L) > 0L
]

# Display the recognised tokens.
english_words_in_text
```
# Test 3
````{r}
# Dictionary used to decide whether a token counts as English.
english_words <- readLines("english_words_patois_edit.txt")

# For every song in Song_Scrape_Trial, keep the tokens of its lyrics that
# appear verbatim in the dictionary. The result is a list with one character
# vector per song, in row order.
# lapply() over the Lyrics column replaces the `1:nrow(...)` index loop, which
# would iterate over c(1, 0) if the table had no rows.
english_words_by_song <- lapply(
  Song_Scrape_Trial$Lyrics,
  function(lyrics) {
    # split the lyrics on runs of non-word characters
    all_words <- unlist(strsplit(lyrics, "\\W+"))
    all_words[all_words %in% english_words]
  }
)

# Print the English words found in each song, one line per song.
# seq_along() yields no iterations for an empty list, unlike `1:length(...)`.
for (i in seq_along(english_words_by_song)) {
  cat("Song", i, "English words:", english_words_by_song[[i]], "\n")
}
````
# Final Test
```{r}
# Dictionary of English words, kept exactly as stored in the file.
english_words <- readLines("english_words_patois_edit.txt")

# Lower-cased copy used only for matching, so that capitalised tokens such as
# "Run" are still recognised as English (the Verification chunk further down
# lower-cases both sides in the same way).
dictionary <- tolower(english_words)

# Per-song counters, one element per row of Song_Scrape_Trial.
n_songs <- nrow(Song_Scrape_Trial)
english_word_count <- numeric(n_songs)
patois_word_count <- numeric(n_songs)

# seq_len() yields no iterations for an empty table, unlike `1:nrow(...)`.
for (i in seq_len(n_songs)) {
  # lyrics of the current song
  lyrics <- Song_Scrape_Trial$Lyrics[i]

  # split on runs of non-word characters
  words <- unlist(strsplit(lyrics, "\\W+"))

  # strsplit() returns "" when the text starts with a non-word character and
  # NA for missing lyrics; neither is a word, so neither may be counted.
  words <- words[!is.na(words) & nzchar(words)]

  # tokens found in the dictionary are English, all remaining tokens Patois
  english_word_count[i] <- sum(tolower(words) %in% dictionary)
  patois_word_count[i] <- length(words) - english_word_count[i]
}

# One row per song: title, English token count, Patois token count.
word_counts <- data.frame(
  Song_Scrape_Trial$Song,
  english_word_count,
  patois_word_count
)
colnames(word_counts) <- c("Song", "English Words", "Patois Words")
```
# Final Test with new data frame 3
```{r}
# Count English and Patois tokens for every song in Try_2.
# NOTE(review): neither `Try_2` nor `english_words` is created in this chunk.
# `english_words` is assigned by earlier chunks; no assignment to `Try_2` is
# visible in this document, so it must already exist in the session - confirm.

# Lower-cased dictionary so that capitalised tokens still match.
dictionary <- tolower(english_words)

# Tokenise all lyrics at once on runs of non-word characters, then drop the
# "" tokens strsplit() emits for text starting with a non-word character and
# the NA it emits for missing lyrics, so they are not counted as Patois words.
tokens_by_song <- lapply(
  strsplit(as.character(Try_2$Lyrics), "\\W+"),
  function(tokens) tokens[!is.na(tokens) & nzchar(tokens)]
)

# Tokens found in the dictionary are English; the rest are counted as Patois.
# Both vectors have one element per row and are empty for an empty table
# (the original `1:nrow(...)` loop would have iterated over c(1, 0)).
english_word_count <- vapply(
  tokens_by_song,
  function(tokens) sum(tolower(tokens) %in% dictionary),
  numeric(1)
)
patois_word_count <- lengths(tokens_by_song) - english_word_count

# One row per song: title, English token count, Patois token count.
word_counts <- data.frame(Try_2$Song, english_word_count, patois_word_count)
colnames(word_counts) <- c("Song", "English Words", "Patois Words")
```
# Final Test with new data frame 3
```{r}
# Count English and Patois tokens for every song in Try_2.
# NOTE(review): `Try_2` is not assigned anywhere visible in this document and
# `english_words` comes from an earlier chunk - confirm both exist when this
# chunk runs.

# Split one lyrics string into word tokens. strsplit() yields "" for text
# that starts with a non-word character and NA for missing lyrics; both are
# removed because they are not words.
split_into_words <- function(lyrics) {
  tokens <- unlist(strsplit(as.character(lyrics), "\\W+"))
  tokens[!is.na(tokens) & nzchar(tokens)]
}

# Matching is done in lower case so capitalised tokens are still recognised.
dictionary <- tolower(english_words)

# Per-song counters, one element per row of Try_2.
n_songs <- nrow(Try_2)
english_word_count <- numeric(n_songs)
patois_word_count <- numeric(n_songs)

# seq_len() is safe for an empty table, where `1:nrow(...)` gives c(1, 0).
for (i in seq_len(n_songs)) {
  words <- split_into_words(Try_2$Lyrics[i])
  english_word_count[i] <- sum(tolower(words) %in% dictionary)
  patois_word_count[i] <- length(words) - english_word_count[i]
}

# One row per song: title, English token count, Patois token count.
word_counts <- data.frame(Try_2$Song, english_word_count, patois_word_count)
colnames(word_counts) <- c("Song", "English Words", "Patois Words")
```
# Verification
```{r}
# Dictionary of English words, lower-cased so that matching ignores case.
english_words <- tolower(readLines("english_words_patois_edit.txt"))

# Lower-case the lyrics of every song in lyrics_data and split them into
# tokens on runs of non-word characters. The Lyrics column is taken as a
# plain vector instead of a one-cell data frame (lyrics_data[i, "Lyrics"]),
# so nothing depends on tolower() coercing a data frame.
tokens_by_song <- strsplit(tolower(as.character(lyrics_data$Lyrics)), "\\W+")

# strsplit() yields "" for text that starts with a non-word character and NA
# for missing lyrics; neither is a word, so both are removed.
tokens_by_song <- lapply(
  tokens_by_song,
  function(tokens) tokens[!is.na(tokens) & nzchar(tokens)]
)

# Per-song lists: tokens found in the dictionary vs. all remaining tokens.
# lapply() also handles a table with no rows, where the original
# `1:nrow(...)` loop would have iterated over c(1, 0).
english_words_list <- lapply(
  tokens_by_song,
  function(tokens) tokens[tokens %in% english_words]
)
patois_words_list <- lapply(
  tokens_by_song,
  function(tokens) tokens[!tokens %in% english_words]
)

# Flatten the per-song lists.
# NOTE: from this point `english_words` holds the words found in the lyrics,
# no longer the dictionary.
english_words <- unlist(english_words_list)
patois_words <- unlist(patois_words_list)

# print all the English words found
cat("English words:\n")
print(unique(english_words))

# print all the patois words found
cat("\n Patois words:\n")
print(unique(patois_words))
```
# Specified Verification W/ lyrics_data (25 variables)
`````{r}
# Delete every run of word characters that ends in "embed" (matched without
# regard to case) from each entry of the Lyrics column.
lyrics_data$Lyrics <- gsub(
  pattern = "\\b\\w+embed\\b",
  replacement = "",
  x = lyrics_data$Lyrics,
  ignore.case = TRUE
)
`````
`````{r}
# Dictionary of English words, lower-cased so that matching ignores case.
english_words <- tolower(readLines("english_words_patois_edit.txt"))

# specify the song (row of lyrics_data) to analyze
song_index <- 1
stopifnot(song_index >= 1, song_index <= nrow(lyrics_data))

# Only the selected song is analysed. The original looped over every row but
# always read row `song_index`, so it tokenised the same lyrics nrow() times
# and stored nrow() identical copies of the result.
lyrics <- tolower(as.character(lyrics_data$Lyrics[song_index]))

# Split on runs of non-word characters; drop the "" token strsplit() emits
# for text starting with a non-word character and the NA for missing lyrics.
all_words <- unlist(strsplit(lyrics, "\\W+"))
all_words <- all_words[!is.na(all_words) & nzchar(all_words)]

# Tokens found in the dictionary are English, all remaining tokens Patois.
is_english <- all_words %in% english_words
english_words_list <- list(all_words[is_english])
patois_words_list <- list(all_words[!is_english])

# Flatten the lists.
# NOTE: from this point `english_words` holds the words found in the lyrics,
# no longer the dictionary.
english_words <- unlist(english_words_list)
patois_words <- unlist(patois_words_list)

# print all the English words found
cat("English words:\n")
print(unique(english_words))

# print all the patois words found
cat("\n Patois words:\n")
print(unique(patois_words))
`````
# Specified Verification W/ lyrics_final (100 variables)
`````{r}
# Dictionary of English words, lower-cased so that matching ignores case.
english_words <- tolower(readLines("english_words_patois_edit.txt"))

# NOTE(review): `lyrics_final` is not assigned anywhere visible in this
# document; it must already exist in the session - confirm.

# specify the song (row of lyrics_final) to analyze
song_index <- 1
stopifnot(song_index >= 1, song_index <= nrow(lyrics_final))

# Tokenise the selected song once. The original ran this inside a loop over
# all rows while always reading row `song_index`, which repeated identical
# work and duplicated the result once per row.
lyrics <- tolower(as.character(lyrics_final$Lyrics[song_index]))
all_words <- unlist(strsplit(lyrics, "\\W+"))

# "" (text starting with a non-word character) and NA (missing lyrics) are
# artefacts of strsplit(), not words.
all_words <- all_words[!is.na(all_words) & nzchar(all_words)]

# Partition the tokens by dictionary membership.
english_words_in_lyrics <- all_words[all_words %in% english_words]
patois_words_in_lyrics <- all_words[!all_words %in% english_words]
english_words_list <- list(english_words_in_lyrics)
patois_words_list <- list(patois_words_in_lyrics)

# Flatten the lists.
# NOTE: from this point `english_words` holds the words found in the lyrics,
# no longer the dictionary.
english_words <- unlist(english_words_list)
patois_words <- unlist(patois_words_list)

# print all the English words found
cat("English words:\n")
print(unique(english_words))

# print all the patois words found
cat("\n Patois words:\n")
print(unique(patois_words))
`````