library(DBI)
library(RSQLite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v purrr 0.3.4
## v tidyr 1.1.2 v forcats 0.5.0
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Set up a database connection to the SMScorpus.db
file.
# Set up database connection
drv = dbDriver("SQLite")
con = dbConnect(drv, dbname = "SMScorpus.db")
List the tables in our database. Which table contains the actual SMS data? Use a SQL query to show the first 10 rows of the table.
new_sms_download
table.# List tables in our database
dbListTables(con)
## [1] "new_sms_download" "sqlite_sequence"
# List fields in our SMS table
dbListFields(con, "new_sms_download")
## [1] "sender" "receiver" "send_time" "collect_time"
## [5] "collect_method" "content" "native" "country"
## [9] "age" "input_method" "experience" "frequency"
## [13] "phone_model" "collector" "gender" "smartphone"
## [17] "lang" "city" "id"
# Show first 10 rows of table wit SQL query
dbGetQuery(con, "SELECT * FROM new_sms_download LIMIT 10")
## sender receiver
## 1 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 2 79780a9dbe83fd1e5dd2bd2543e7da2a ce3e0c80307aea9b4f656890d19735b3
## 3 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 4 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 5 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 6 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 7 79780a9dbe83fd1e5dd2bd2543e7da2a 279fab0955409c56520931d354c00a9b
## 8 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 9 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## 10 79780a9dbe83fd1e5dd2bd2543e7da2a 0ffc7585148560b7520931d354c00a9b
## send_time collect_time collect_method
## 1 2010.10.24 11:59 2010/11 SMS Export
## 2 2010.10.28 11:53 2010/11 SMS Export
## 3 2010.10.26 22:21 2010/11 SMS Export
## 4 2010.10.25 20:27 2010/11 SMS Export
## 5 2010.10.25 08:33 2010/11 SMS Export
## 6 2010.10.23 18:06 2010/11 SMS Export
## 7 2010.10.27 21:41 2010/11 SMS Export
## 8 2010.10.26 14:49 2010/11 SMS Export
## 9 2010.10.25 17:49 2010/11 SMS Export
## 10 2010.10.24 19:20 2010/11 SMS Export
## content native country age
## 1 K yes India 21-25
## 2 Studying? yes India 21-25
## 3 Vch photo yes India 21-25
## 4 K:-)ya i hav to finish yes India 21-25
## 5 K yes India 21-25
## 6 One senioq akka yes India 21-25
## 7 K d yes India 21-25
## 8 She vil mistake me only cha.dnt talk to me also yes India 21-25
## 9 I am standing up yes India 21-25
## 10 K yes India 21-25
## input_method experience frequency phone_model collector
## 1 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 2 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 3 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 4 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 5 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 6 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 7 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 8 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 9 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## 10 Multi-tap 3 to 5 years More than 50 SMS daily Nokia Tao Chen
## gender smartphone lang city id
## 1 unknown unknown en Tiruppur 1
## 2 unknown unknown en Tiruppur 2
## 3 unknown unknown en Tiruppur 3
## 4 unknown unknown en Tiruppur 4
## 5 unknown unknown en Tiruppur 5
## 6 unknown unknown en Tiruppur 6
## 7 unknown unknown en Tiruppur 7
## 8 unknown unknown en Tiruppur 8
## 9 unknown unknown en Tiruppur 9
## 10 unknown unknown en Tiruppur 10
Let’s start working on the XML file. Our goal is to extract from it a data frame which is as close as possible to the DB table. First, read the contents of the XML file into a dataframe named SMScorpus_xml_raw
.
SMScorpus_xml_raw <- readLines("smsCorpus_en_2015.03.09_all.xml", warn = FALSE)
Next, create a new dataframe called SMScorpus_xml
that only contains actual messages.
message_pattern <- "<message id="
# This variable only contains rows of the raw data with "<message id="
SMScorpus <- SMScorpus_xml_raw[grep(x = SMScorpus_xml_raw, pattern = message_pattern)]
# Substitute everything that is not part of a text message
text_exp <- "(.*)<text>|</text>(.*)"
texts <- gsub(text_exp, "", SMScorpus)
# This variable only contains the text message content
SMScorpus_xml <- data.frame(matrix(unlist(texts), nrow = length(texts)))
colnames(SMScorpus_xml) <- c("content")
Verify that the number of messages is the same in both your XML version and the DB version. Assign that number as a numeric scalar to a variable called N.
N0 = nrow(SMScorpus_xml)
N = nrow(dbGetQuery(con, "SELECT * FROM new_sms_download"))
N0
## [1] 55835
N
## [1] 55835
Are the two data sets the same? In order to answer the question, we will extract the data from the XML file using string manipulations and store it in the same dimensions and variable names as the DB version.
On examining the XML structure, you’ll find that some fields are defined with an equal sign (“=”) and some in a different way.
Write a function called getXMLfield
which takes the following inputs:
field
- the name of the field to extract from each of the XML rows.dat
- the character vector object that contains the XML rows.with_equal_sign
- a logical value that tells the function whether the field is defined with an equal sign or not.type_of_result
- a string whose default value should be “chr”. If it is set to “int”, the function will convert the result to an integer vector.The output will be a vector that contains the values of the field
variable.
In addition, the function should convert the XML escaped characters to their actual value.
# The data we want to collect
SMScorpus_db_df <- dbGetQuery(con, "SELECT * FROM new_sms_download")
slice(SMScorpus_db_df, 45719)
## sender receiver send_time collect_time collect_method content
## 1 51 unknown unknown 2003/4 unknown Bugis oso near wat...
## native country age input_method experience frequency phone_model
## 1 unknown SG unknown unknown unknown unknown unknown
## collector gender smartphone lang city id
## 1 howyijue unknown unknown en unknown 45719
dat
, with_equal_sign
, and type_of_result
variables.field
as the name of the field in which the data frame column will be named rather than the name of the field in the raw data. It was easier for me to keep track of which values will correspond to what in the DB data file. Due to this, the raw “time” field was not vectorized for returning their values.# SMScorpus only contains rows of the raw data with "<message id="
row1 <- SMScorpus[[1]]
# What the input corresponds to
fields <- c("message id", "text", "srcNumber", "manufactuer", "smartphone", "age", "gender", "nativeSpeaker", "country", "city", "experience", "frequency", "inputMethod", "language", "time", "collector", "time", "destNumber", "method")
# What the user inputs
field_names <- c("id", "content", "sender", "phone_model", "smartphone", "age", "gender", "native", "country", "city", "experience", "frequency", "input_method", "lang", "send_time", "collector", "collect_time", "receiver", "collect_method" )
names(fields) <- field_names
getXMLfield_og <- function(field, dat, with_equal_sign, type_of_result="chr"){
field_name <- fields[field][[1]]
if (field_name == "time"){
if (field == "send_time"){
exp <- "<messageProfile(.*?)><"
}
else if (field == "collect_time"){
exp <- "<collectionMethod(.*?)><"
}
matches <- gregexpr(pattern = exp, tex = dat)
rmatches <- regmatches(x=dat, m=matches)
}
else {
rmatches <- dat
}
if (with_equal_sign == TRUE) {
exp <- paste("(?<=", field_name, "=\")(.*?)(?=\")", sep="")
}
else if (with_equal_sign == FALSE) {
exp <- paste("(?<=", field_name, ">)(.*?)(?=</", field_name, ")", sep="")
}
matches <- gregexpr(pattern = exp, text = rmatches, perl=TRUE)
result <- regmatches(x=rmatches, m=matches)
if (field == "content"){
result <- gsub(""", "\"", result)
result <- gsub("'", "'", result)
result <- gsub("<", "<", result)
result <- gsub(">", ">", result)
result <- gsub("&", "&", result)
}
if (type_of_result == "int") {
result <- as.integer(result)
}
else if (type_of_result == "chr") {
result <- as.character(result)
}
return(result)
}
getXMLfield_og("sender", row1, FALSE)
## [1] "51"
getXMLfield_og("sender", row1, FALSE, "int")
## [1] 51
getXMLfield_og("send_time", row1, TRUE)
## [1] "unknown"
getXMLfield_og("collect_time", row1, TRUE)
## [1] "2003/4"
do.call(rbind, listname)
which worked. However, it wasn’t keeping any null values which made the data frame length of 55796 when it really needs to have N = 55835 rows.rbindlst()
which acts essentially the same but it does keep null values! So, the cell block below shows the package I needed to install in order to use this function.rbindlst()
function is that I had to transpose the data using t()
.#install.packages("data.table")
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
dat
, with_equal_sign
, and type_of_result
variables.field
corresponds to the name of the field in the raw data. This will allow for vectorized return of the hierarchical data for “time”.# SMScorpus only contains rows of the raw data with "<message id="
row1 <- SMScorpus[[1]]
# What the user inputs
fields <- c("message id", "text", "srcNumber", "manufactuer", "smartphone", "age", "gender", "nativeSpeaker", "country", "city", "experience", "frequency", "inputMethod", "language", "time", "collector", "time", "destNumber", "method")
getXMLfield <- function(field, dat, with_equal_sign, type_of_result="chr"){
field_name <- field
rmatches <- dat
if (with_equal_sign == TRUE) {
exp <- paste("(?<=", field_name, "=\")(.*?)(?=\")", sep="")
}
else if (with_equal_sign == FALSE) {
exp <- paste("(?<=", field_name, ">)(.*?)(?=</", field_name, ")", sep="")
}
matches <- gregexpr(pattern = exp, text = rmatches, perl=TRUE)
result <- regmatches(x=rmatches, m=matches)
if (field_name == "text"){
result <- gsub(""", "\"", result)
result <- gsub("'", "'", result)
result <- gsub("<", "<", result)
result <- gsub(">", ">", result)
result <- gsub("&", "&", result)
}
if (type_of_result == "int") {
result <- as.integer(result)
}
else if (typeof(result) == "list"){
result <- rbindlist(list(result))
result <- t(result)
}
else {
result <- as.character(result)
}
return(result)
}
getXMLfield("srcNumber", row1, FALSE)
## [,1]
## V1 "51"
getXMLfield("srcNumber", row1, FALSE, "int")
## [1] 51
N == length(complete.cases(SMScorpus_db_df$send_time)) # TRUE
## [1] TRUE
N == length(complete.cases(SMScorpus_db_df$collect_time)) # TRUE
## [1] TRUE
Once you are happy with your function, you should create a new data frame for the XML data.
DBfields <- dbListFields(con, "new_sms_download")
SMScorpus_xml_df <- data.frame(matrix(NA, nrow = N, ncol = length(DBfields)))
colnames(SMScorpus_xml_df) <- DBfields
# Make sure we know all of the field names
sort(DBfields) == sort(field_names)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE
DBfields
## [1] "sender" "receiver" "send_time" "collect_time"
## [5] "collect_method" "content" "native" "country"
## [9] "age" "input_method" "experience" "frequency"
## [13] "phone_model" "collector" "gender" "smartphone"
## [17] "lang" "city" "id"
This initializes a data frame that has the same number of rows and the same columns (alongside their names) as the SMS DB file.
We will ignore here two columns from the DB version, send_time and collect_time. So execute:
SMScorpus_xml_df$send_time <- NULL
SMScorpus_xml_df$collect_time <- NULL
# Trying to handle "time" cases automatically
SMScorpus_xml_df[, c("send_time", "collect_time")] <-
getXMLfield(field = "time", dat = SMScorpus, with_equal_sign = TRUE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
You should fill all your columns using the function you created. You need to run it once for each variable. Take care to choose the names correctly!
SMScorpus_xml_df$collect_method <-
getXMLfield("method", SMScorpus, TRUE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$receiver <-
getXMLfield("destNumber", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$id <-
getXMLfield("message id", SMScorpus, TRUE, "int")
SMScorpus_xml_df$content <-
getXMLfield("text", SMScorpus, FALSE)
SMScorpus_xml_df$sender <-
getXMLfield("srcNumber", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$phone_model <-
getXMLfield("manufactuer", SMScorpus, TRUE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$smartphone <-
getXMLfield("smartphone", SMScorpus, TRUE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$age <-
getXMLfield("age", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$gender <-
getXMLfield("gender", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$native <-
getXMLfield("nativeSpeaker", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$country <-
getXMLfield("country", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$city <-
getXMLfield("city", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$experience <-
getXMLfield("experience", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$frequency <-
getXMLfield("frequency", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$input_method <-
getXMLfield("inputMethod", SMScorpus, FALSE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$lang <-
getXMLfield("language", SMScorpus, TRUE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
SMScorpus_xml_df$collector <-
getXMLfield("collector", SMScorpus, TRUE)
## Warning in rbindlist(list(result)): Column 14154 [''] of item 1 is length 0.
## This (and 38 others like it) has been filled with NA (NULL for list columns) to
## make each item uniform.
Print rows 1-10, 20001-20010 and 40001-40010 of your resulting data frame.
SMScorpus_xml_df[1:10,]
## sender receiver collect_method
## 1 51 unknown unknown
## 2 51 unknown unknown
## 3 51 unknown unknown
## 4 51 unknown unknown
## 5 51 unknown unknown
## 6 51 unknown unknown
## 7 51 unknown unknown
## 8 51 unknown unknown
## 9 51 unknown unknown
## 10 51 unknown unknown
## content
## 1 Bugis oso near wat...
## 2 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## 3 I dunno until when... Lets go learn pilates...
## 4 Den only weekdays got special price... Haiz... Cant eat liao... Cut nails oso muz wait until i finish drivin wat, lunch still muz eat wat...
## 5 Meet after lunch la...
## 6 m walking in citylink now ü faster come down... Me very hungry...
## 7 5 nights...We nt staying at port step liao...Too ex
## 8 Hey pple...$700 or $900 for 5 nights...Excellent location wif breakfast hamper!!!
## 9 Yun ah.the ubi one say if ü wan call by tomorrow.call 67441233 look for irene.ere only got bus8,22,65,61,66,382. Ubi cres,ubi tech park.6ph for 1st 5wkg days.èn
## 10 Hey tmr maybe can meet you at yck
## native country age input_method experience frequency phone_model
## 1 unknown SG unknown unknown unknown unknown unknown
## 2 unknown SG unknown unknown unknown unknown unknown
## 3 unknown SG unknown unknown unknown unknown unknown
## 4 unknown SG unknown unknown unknown unknown unknown
## 5 unknown SG unknown unknown unknown unknown unknown
## 6 unknown SG unknown unknown unknown unknown unknown
## 7 unknown SG unknown unknown unknown unknown unknown
## 8 unknown SG unknown unknown unknown unknown unknown
## 9 unknown SG unknown unknown unknown unknown unknown
## 10 unknown SG unknown unknown unknown unknown unknown
## collector gender smartphone lang city id send_time collect_time
## 1 howyijue unknown unknown en unknown 10120 unknown 2003/4
## 2 howyijue unknown unknown en unknown 10121 unknown 2003/4
## 3 howyijue unknown unknown en unknown 10122 unknown 2003/4
## 4 howyijue unknown unknown en unknown 10123 unknown 2003/4
## 5 howyijue unknown unknown en unknown 10124 unknown 2003/4
## 6 howyijue unknown unknown en unknown 10125 unknown 2003/4
## 7 howyijue unknown unknown en unknown 10126 unknown 2003/4
## 8 howyijue unknown unknown en unknown 10127 unknown 2003/4
## 9 howyijue unknown unknown en unknown 10128 unknown 2003/4
## 10 howyijue unknown unknown en unknown 10129 unknown 2003/4
SMScorpus_xml_df[20001:20010,]
## sender
## 20001 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20002 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20003 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20004 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20005 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20006 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20007 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20008 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20009 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## 20010 c7adea30c5e9bac455066af49acbf76eb63bd4324e6d431c
## receiver collect_method
## 20001 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20002 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20003 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20004 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20005 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20006 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20007 6c82416ba4976764d20a707d2be75e0e SMS Upload
## 20008 0a5f29745d0ae10cd72f84eef4ae098e SMS Upload
## 20009 0a5f29745d0ae10cd72f84eef4ae098e SMS Upload
## 20010 0a5f29745d0ae10cd72f84eef4ae098e SMS Upload
## content
## 20001 Yeah hopefully, if tyler can't do it I could maybe ask around a bit
## 20002 Haha, that was the first person I was gonna ask
## 20003 Gumby's has a special where a <#> " cheese pizza is $2 so I know what we're doin tonight
## 20004 What
## 20005 Sweeeeetg
## 20006 Ugh fucking typos
## 20007 Aight
## 20008 Jus chillin, you?
## 20009 Yeah why not, is the gang all ready
## 20010 Then I'm on mah way
## native country age input_method experience frequency
## 20001 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20002 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20003 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20004 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20005 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20006 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20007 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20008 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20009 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## 20010 yes United States 16-20 Multi-tap 3 to 5 years 5 to 10 SMS daily
## phone_model collector gender smartphone lang city id
## 20001 Motorola Tao Chen male yes en Tampa 9884
## 20002 Motorola Tao Chen male yes en Tampa 9885
## 20003 Motorola Tao Chen male yes en Tampa 9886
## 20004 Motorola Tao Chen male yes en Tampa 9887
## 20005 Motorola Tao Chen male yes en Tampa 9888
## 20006 Motorola Tao Chen male yes en Tampa 9889
## 20007 Motorola Tao Chen male yes en Tampa 9890
## 20008 Motorola Tao Chen male yes en Tampa 9891
## 20009 Motorola Tao Chen male yes en Tampa 9892
## 20010 Motorola Tao Chen male yes en Tampa 9893
## send_time collect_time
## 20001 2011.01.31 03:00:26 2011/2
## 20002 2011.01.31 03:02:10 2011/2
## 20003 2011.01.31 04:47:34 2011/2
## 20004 2011.01.31 04:48:30 2011/2
## 20005 2011.01.31 04:49:13 2011/2
## 20006 2011.01.31 04:49:32 2011/2
## 20007 2011.01.31 06:08:23 2011/2
## 20008 2011.01.31 09:45:06 2011/2
## 20009 2011.01.31 09:48:14 2011/2
## 20010 2011.01.31 09:53:46 2011/2
SMScorpus_xml_df[40001:40010,]
## sender
## 40001 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40002 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40003 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40004 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40005 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40006 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40007 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40008 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40009 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## 40010 5947942eb22c76ff040d86eff45702cf8828029e5ca2413a
## receiver collect_method
## 40001 c4caecfe70266e7f3aa46133188d52d8 SMS Upload
## 40002 fd0ccca246248842ffb06152f32af4cf SMS Upload
## 40003 98538082b86a54636a8d70e2f8bf0594 SMS Upload
## 40004 68fdfb1e1ab3a701ffb06152f32af4cf SMS Upload
## 40005 c4caecfe70266e7f3aa46133188d52d8 SMS Upload
## 40006 7fe7c24ccbab1fbaffb06152f32af4cf SMS Upload
## 40007 dac3eb044800d99ba6bc6bf73f59ee85 SMS Upload
## 40008 c4caecfe70266e7f3aa46133188d52d8 SMS Upload
## 40009 c4caecfe70266e7f3aa46133188d52d8 SMS Upload
## 40010 c4caecfe70266e7f3aa46133188d52d8 SMS Upload
## content native country age
## 40001 Lols okay nvm yes Singapore 16-20
## 40002 Ps I 2moro can't go yes Singapore 16-20
## 40003 Ps I 2moro can't go yes Singapore 16-20
## 40004 Send me ur flight details? yes Singapore 16-20
## 40005 Yea its okay. U weren't late yest yes Singapore 16-20
## 40006 Wth father day 3rd Sunday of june yes Singapore 16-20
## 40007 Hey so is it on <#> rd yes Singapore 16-20
## 40008 Wahh pangsehhhhhhhhh yes Singapore 16-20
## 40009 Lol jkjk I was using AYE msg as template yes Singapore 16-20
## 40010 Yea enjoy while we can yes Singapore 16-20
## input_method experience frequency phone_model
## 40001 Predictive More than 10 years More than 50 SMS daily Samsung
## 40002 Predictive More than 10 years More than 50 SMS daily Samsung
## 40003 Predictive More than 10 years More than 50 SMS daily Samsung
## 40004 Predictive More than 10 years More than 50 SMS daily Samsung
## 40005 Predictive More than 10 years More than 50 SMS daily Samsung
## 40006 Predictive More than 10 years More than 50 SMS daily Samsung
## 40007 Predictive More than 10 years More than 50 SMS daily Samsung
## 40008 Predictive More than 10 years More than 50 SMS daily Samsung
## 40009 Predictive More than 10 years More than 50 SMS daily Samsung
## 40010 Predictive More than 10 years More than 50 SMS daily Samsung
## collector gender smartphone lang city id send_time
## 40001 Tao Chen male yes en Singapore 29884 2011.06.09 09:01:42
## 40002 Tao Chen male yes en Singapore 29885 2011.06.09 10:30:42
## 40003 Tao Chen male yes en Singapore 29886 2011.06.09 10:30:42
## 40004 Tao Chen male yes en Singapore 29887 2011.06.09 10:12:25
## 40005 Tao Chen male yes en Singapore 29888 2011.06.10 08:29:58
## 40006 Tao Chen male yes en Singapore 29889 2011.06.10 03:05:53
## 40007 Tao Chen male yes en Singapore 29890 2011.06.10 04:39:30
## 40008 Tao Chen male yes en Singapore 29891 2011.06.10 05:36:12
## 40009 Tao Chen male yes en Singapore 29892 2011.06.10 05:37:42
## 40010 Tao Chen male yes en Singapore 29893 2011.06.10 05:38:55
## collect_time
## 40001 2011/11
## 40002 2011/11
## 40003 2011/11
## 40004 2011/11
## 40005 2011/11
## 40006 2011/11
## 40007 2011/11
## 40008 2011/11
## 40009 2011/11
## 40010 2011/11
Let us now check our new data frame that is based on the XML file.
What is the main glitch in the XML data? Compare your findings to the DB version, using only direct SQL queries. (hint: look for a variable that should have unique entries for each row)
# Print DB data in ascending order
SMSdb <- dbGetQuery(con, "SELECT content FROM new_sms_download ORDER BY content LIMIT 5")
SMSdb
## content
## 1 \nLol how did u get put in an arm bar.
## 2 _2_|_7_|_6_ _9_|_5_|_1_ _4_|_3_|_8_
## 3 Tere dil ka dard kis ne dekha ha, "Nargis"...,Sab kanjar hain, sirf mammay dekhte hain...!
## 4 <#>
## 5 australia ke elawa aur bhi jagan ab ho raha ha implement..BROADBAND POWER LINE
# Print XML data in ascending order
xml_sorted <- SMScorpus_xml_df[order(SMScorpus_xml_df$content),]
row.names(xml_sorted) <- NULL
SMSxml <- xml_sorted['content']
head(SMSxml,5)
## content
## 1 '
## 2 '
## 3 '
## 4 ''An Amazing Quote'' - "Sometimes in life its difficult to decide whats wrong!! a lie that brings a smile or the truth that brings a tear...."
## 5 ''Life is beautiful only for those who knw how to celebrate the PAIN".-.. :-)Gud nite :-)
ü
is written as “ü”. A similar thing occurs for characters that are in a different language. For example, the character 鈥
is written as 鈥
. This glitch also makes it so the data are not comparable in ascending order.Several text messages in the XML data are corrupt. Find those using functions like table()
, which()
(or others) and show them. Compare those to the correct messages in the DB version. For full credit, use only SQL queries for the latter. For partial credit you can copy the table into an R data frame and examine it directly with dplyr
or other commands. Identify the issue that causes the discrepancy.
sum(is.na(SMScorpus_xml_df$content)) # zero
## [1] 0
sum(is.nan(SMScorpus_xml_df$content)) # zero
## [1] 0
sum(SMScorpus_xml_df$content == "") # zero
## [1] 0
sum(SMScorpus_xml_df$content == "NULL") # zero
## [1] 0
sum(is.null(SMScorpus_xml_df$content)) # zero
## [1] 0
anyNA(SMScorpus_xml_df$content) # FALSE
## [1] FALSE
# Are all cases complete?
N == length(complete.cases(SMScorpus_xml_df$content)) # TRUE
## [1] TRUE
tt <- as.data.frame(table(SMScorpus_xml_df$content)) # TRUE
N == sum(tt$Freq)
## [1] TRUE
sum(is.na(SMScorpus_xml_df)) # 663
## [1] 663
sum(is.na(SMScorpus_db_df)) # 0
## [1] 0
colSums(is.na(SMScorpus_xml_df)) # 17 columns contain 39
## content
## 39 39 39 0 39 39
##
## 39 39 39 39 39 39
## id send_time
## 39 39 39 39 0 39
## collect_time
## 39
rows <- SMScorpus_xml_df %>%
rowid_to_column() %>%
filter(is.na(sender))
rowids <- rows$rowid
rowids # these are the 39 rows that have the NA values
## [1] 14154 14312 14432 15041 15068 15070 15145 15356 15362 15363 15364 15365
## [13] 15386 15442 15524 15525 20876 20894 20895 20896 20897 20898 37509 37519
## [25] 37542 37550 37561 37578 37622 37624 37625 37626 37627 37630 37639 37642
## [37] 37698 50651 51171
table()
function. All cases are complete, i.e., have no missing values in the texts.SMScorpus_xml_df[rowids,]
## sender receiver collect_method content native country age
## 14154 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 14312 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 14432 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15041 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15068 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15070 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15145 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15356 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15362 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15363 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15364 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15365 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15386 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15442 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15524 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 15525 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 20876 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 20894 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 20895 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 20896 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 20897 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 20898 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37509 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37519 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37542 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37550 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37561 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37578 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37622 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37624 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37625 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37626 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37627 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37630 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37639 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37642 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 37698 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 50651 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## 51171 <NA> <NA> <NA> character(0) <NA> <NA> <NA>
## input_method experience frequency phone_model collector gender smartphone
## 14154 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 14312 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 14432 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15041 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15068 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15070 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15145 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15356 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15362 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15363 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15364 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15365 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15386 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15442 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15524 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 15525 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 20876 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 20894 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 20895 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 20896 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 20897 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 20898 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37509 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37519 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37542 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37550 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37561 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37578 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37622 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37624 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37625 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37626 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37627 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37630 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37639 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37642 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 37698 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 50651 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 51171 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## lang city id send_time collect_time
## 14154 <NA> <NA> 4037 <NA> <NA>
## 14312 <NA> <NA> 4195 <NA> <NA>
## 14432 <NA> <NA> 4315 <NA> <NA>
## 15041 <NA> <NA> 4924 <NA> <NA>
## 15068 <NA> <NA> 4951 <NA> <NA>
## 15070 <NA> <NA> 4953 <NA> <NA>
## 15145 <NA> <NA> 5028 <NA> <NA>
## 15356 <NA> <NA> 5239 <NA> <NA>
## 15362 <NA> <NA> 5245 <NA> <NA>
## 15363 <NA> <NA> 5246 <NA> <NA>
## 15364 <NA> <NA> 5247 <NA> <NA>
## 15365 <NA> <NA> 5248 <NA> <NA>
## 15386 <NA> <NA> 5269 <NA> <NA>
## 15442 <NA> <NA> 5325 <NA> <NA>
## 15524 <NA> <NA> 5407 <NA> <NA>
## 15525 <NA> <NA> 5408 <NA> <NA>
## 20876 <NA> <NA> 10759 <NA> <NA>
## 20894 <NA> <NA> 10777 <NA> <NA>
## 20895 <NA> <NA> 10778 <NA> <NA>
## 20896 <NA> <NA> 10779 <NA> <NA>
## 20897 <NA> <NA> 10780 <NA> <NA>
## 20898 <NA> <NA> 10781 <NA> <NA>
## 37509 <NA> <NA> 27392 <NA> <NA>
## 37519 <NA> <NA> 27402 <NA> <NA>
## 37542 <NA> <NA> 27425 <NA> <NA>
## 37550 <NA> <NA> 27433 <NA> <NA>
## 37561 <NA> <NA> 27444 <NA> <NA>
## 37578 <NA> <NA> 27461 <NA> <NA>
## 37622 <NA> <NA> 27505 <NA> <NA>
## 37624 <NA> <NA> 27507 <NA> <NA>
## 37625 <NA> <NA> 27508 <NA> <NA>
## 37626 <NA> <NA> 27509 <NA> <NA>
## 37627 <NA> <NA> 27510 <NA> <NA>
## 37630 <NA> <NA> 27513 <NA> <NA>
## 37639 <NA> <NA> 27522 <NA> <NA>
## 37642 <NA> <NA> 27525 <NA> <NA>
## 37698 <NA> <NA> 27581 <NA> <NA>
## 50651 <NA> <NA> 40534 <NA> <NA>
## 51171 <NA> <NA> 41054 <NA> <NA>
SMScorpus[rowids]
## [1] "<message id=\"4037\"><text>BEGIN:VCARD"
## [2] "<message id=\"4195\"><text>BEGIN:VCARD"
## [3] "<message id=\"4315\"><text>BEGIN:VCARD"
## [4] "<message id=\"4924\"><text>You are. "
## [5] "<message id=\"4951\"><text>dude i woke up and i was in an armbar. i think he overextended it too. "
## [6] "<message id=\"4953\"><text>Yo david my arm is all janked up cuz my friend arm barred me so i prob cant workout with you until it stops hurting "
## [7] "<message id=\"5028\"><text>O. .. "
## [8] "<message id=\"5239\"><text>BEGIN:VCARD"
## [9] "<message id=\"5245\"><text>BEGIN:VCARD"
## [10] "<message id=\"5246\"><text>BEGIN:VCARD"
## [11] "<message id=\"5247\"><text>BEGIN:VCARD"
## [12] "<message id=\"5248\"><text>BEGIN:VCARD"
## [13] "<message id=\"5269\"><text>BEGIN:VCARD"
## [14] "<message id=\"5325\"><text>BEGIN:VCARD"
## [15] "<message id=\"5407\"><text>BEGIN:VCARD"
## [16] "<message id=\"5408\"><text>BEGIN:VCARD"
## [17] "<message id=\"10759\"><text>im well early to work you bum! im gonna stop home tonight so il see you when i see you. :) "
## [18] "<message id=\"10777\"><text>assalam aliekum, "
## [19] "<message id=\"10778\"><text>assalam, "
## [20] "<message id=\"10779\"><text>my email is <EMAIL> .. "
## [21] "<message id=\"10780\"><text>Assalamaliekum, "
## [22] "<message id=\"10781\"><text>assalam aliekum "
## [23] "<message id=\"27392\"><text>Hey, sorry was sleeping just nw, i would take programmer, if tat's okay for u guys. "
## [24] "<message id=\"27402\"><text>T shirt + shorts "
## [25] "<message id=\"27425\"><text>Tuesday after dinner wan watch movie together? World invasion: battle of LA 9pm de. "
## [26] "<message id=\"27433\"><text>Clouds uploaded. "
## [27] "<message id=\"27444\"><text>The pirelajewellery I get the account already, but still some problem cos they make it only one year de, so i still email-ing them. "
## [28] "<message id=\"27461\"><text>Cs <#> "
## [29] "<message id=\"27505\"><text>haha u need meto "
## [30] "<message id=\"27507\"><text>paisseh I didn't check my "
## [31] "<message id=\"27508\"><text>haha ok anyway in total "
## [32] "<message id=\"27509\"><text>ok!! I go tale a "
## [33] "<message id=\"27510\"><text>everything is nice!! forthe loading screen ah is it possible to draw some "
## [34] "<message id=\"27513\"><text>npnp ! all the new stuff just put in the mingxun folder I renamed it liao so easier for me also! thx! anyway also still "
## [35] "<message id=\"27522\"><text>ok all are nice!!! thank you umm one more thing! my friends commented that our "
## [36] "<message id=\"27525\"><text>hello!! I sent u another email with regards to space platform "
## [37] "<message id=\"27581\"><text>lec over. come to d lab "
## [38] "<message id=\"40534\"><text>Borrow the one at home lah "
## [39] "<message id=\"41054\"><text>Yepyep got them from Istanbul "
We will first generate some additional quantitative variables, based on different summaries of the text messages. Here, you may copy the DB table to a local data frame and use dplyr
to analyze and transform your data. Plots should be generated with ggplot
.
Create a new variable that counts the number of characters in a text message.
texts_vec <- unlist(SMScorpus_db_df$content)
char_counts <- nchar(texts_vec)
Create a new variable that counts the number of words (as in strings that are separated by any type of space) in a text message (you might need to iterate or use an apply
family function).
word_counts <- sapply(strsplit(texts_vec," "), length)
Create a new variable that measures the average length of a word in text message (That is, for each text message measure first the number of characters in each word, and average over those).
word_lengths <- sapply(strsplit(texts_vec, " "), nchar)
word_sums <- sapply(word_lengths, sum)
avg_lengths <- word_sums / word_counts # divide the sum of word lengths by N words
Create a new variable that counts how many exclamation marks appear in a text message. You may use the str_count
function from the stringr
package.
exclamations <- str_count(texts_vec, pattern = "!")
Create a new variable that counts how many question marks appear in a text message. You may use the str_count
function from the stringr
package.
questions <- str_count(texts_vec, pattern = '\\?')
Create a new variable that counts how many ellipsis (“…”) marks appear in a text message. You may use the str_count
function from the stringr
package.
dots <- str_count(texts_vec, pattern = '\\.\\.\\.')
Analyze our new variables!
char_counts
= number of characters in a text messageword_counts
= number of words in a text messageavg_lengths
= average length of words in a text messageexclamations
= number of exclamation marks in a text messagequestions
= number of question marks in a text messagedots
= number of ellipsis in a text messagecounts_df <- data.frame(matrix(NA, nrow = length(char_counts), ncol = 6))
colnames(counts_df) <- c('char_counts', 'word_counts', 'avg_lengths', 'exclamations', 'questions', 'dots')
counts_df$char_counts <- char_counts
counts_df$word_counts <- word_counts
counts_df$avg_lengths <- avg_lengths
counts_df$exclamations <- exclamations
counts_df$questions <- questions
counts_df$dots <- dots
Number of characters
Plot a histogram (normalized to density) of the variable that measures the number of characters in a text message. Select the fill to be red with 0.5 transparency, select the line color to be black. Set the number of bins to 100.
plot1 <- ggplot(counts_df, aes(x=char_counts)) +
geom_histogram(aes(y=..density..), colour="black", fill="red", alpha=0.5, bins=100)
plot1
#### 2.A.ii.
The tail has a weird break a little left of 200. Why do you think it is there? Can you find the exact value based on your qualitative hypothesis (look online to confirm your suspicion)? Add a blue vertical line to the histogram at the cutoff value (try geom_vline
).
plot_data <- ggplot_build(plot1)
plot_data_df <- plot_data$data[[1]]
xval <- plot_data_df$x[18]
yval <- plot_data_df$density[18]
xval
## [1] 156.0909
plot1 + geom_vline(aes(xintercept=xval), color="blue", linetype="dashed", size = 1)
Let us see now if there’s a difference between the groups of people that write messages up to the cutoff value you found. First, make a bar plot that shows the proportions of people that write messages below and above this threshold grouped by age. In order for the plot to be more informative, please plot only for age groups which are not “No response” or “unknown” (do not remove these values from the data, just plot without those).
all_df <- cbind(SMScorpus_db_df, counts_df)
threshold <- char_counts > 156
all_df$above_156 = threshold
nn <- length(threshold)
plot2 <- ggplot(data = all_df[which((all_df$age != "unknown") & (all_df$age != "No response")),]) +
geom_bar(mapping = aes(x = age))
plot_data <- ggplot_build(plot2)
plot_data_df <- plot_data$data[[1]]
nn_vec <- plot_data_df$count # this quantity is the number of texts in each age group
nn_vec <- rep(nn_vec,c(2,2,2,2,2,2,2,1))
ggplot(data = all_df[which((all_df$age != "unknown") & (all_df$age != "No response")),]) +
geom_bar(mapping = aes(x = age, fill = above_156), position = "dodge") +
ggtitle("Number of texts with character length above or below 156 in each age group")
# The percent is given by: (# of texts that are True or False in a particular age group) / (# texts in that particular age group)
ggplot(data = all_df[which((all_df$age != "unknown") & (all_df$age != "No response")),]) +
geom_bar(mapping = aes(x = age, fill = above_156, y = ..count../nn_vec), position = "dodge") +
scale_y_continuous(name = "%", labels=scales::percent) +
ggtitle("% of texts in each age group that have a character length above or below 156")
What did you learn from the plot?
all_df$gender <- gsub("F", "f", all_df$gender)
plot3 <- ggplot(data = all_df[which(all_df$gender != "unknown"),]) +
geom_bar(mapping = aes(x = gender))
plot_data <- ggplot_build(plot3)
plot_data_df <- plot_data$data[[1]]
nn_vec <- plot_data_df$count # this quantity is the number of texts in each gender
nn_vec <- rep(nn_vec,c(2,2))
ggplot(data = all_df[which(all_df$gender != "unknown"),]) +
geom_bar(mapping = aes(x = gender, fill = above_156), position = "dodge") +
ggtitle("Number of texts with character length above or below 156 in each gender")
ggplot(data = all_df[which(all_df$gender != "unknown"),]) +
geom_bar(mapping = aes(x = gender, fill = above_156, y = ..count../nn_vec), position = "dodge") +
scale_y_continuous(name = "%", labels=scales::percent) +
ggtitle("% of texts in each gender that have a character length above or below 156")
What did you learn from the plot?
Number of words
Make a box plot that best demonstrates how the distributions of the number of words change with age, by gender.
ggplot(data = all_df[which((all_df$age != "unknown") & (all_df$age != "No response") & (all_df$gender != "unknown") & (all_df$age < 46)),], mapping = aes(x = word_counts, y = age, fill=gender)) +
geom_boxplot()
What do you learn from this plot?
Make a box plot that best demonstrates whether or not the use of smartphone encourages messages that have more words. Before you do so, change all “Yes” entries in the smartphone
column to “yes”.
all_df$smartphone <- gsub("Y", "y", all_df$smartphone)
ggplot(data = all_df, mapping = aes(x = word_counts, y = smartphone)) +
geom_boxplot(color="red", fill="magenta", alpha=0.2)
ggplot(data = all_df[which(all_df$gender != "unknown"),]) +
geom_point(mapping = aes(x = questions, y = exclamations, color = gender), position = "jitter") +
xlim(0,20) + ylim(0,20) + facet_wrap(~ age, nrow = 2)
## Warning: Removed 25681 rows containing missing values (geom_point).