# Read the workshop sign-up responses from the CSV export of the form sheet.
d <- read.csv(
  file = "~/Downloads/data visualization in R workshop (Responses) - Form Responses 1 (8).csv"
)

# Show the column names read.csv() derived from the form's question text.
colnames(d)
## [1] "Timestamp"
## [2] "Email.Address"
## [3] "Position.title"
## [4] "Do.you.plan.on.attending.this.R.workshop.on.data.visualization."
## [5] "I.am.attending.from"
## [6] "Did.you.attend.the.previous.R.workshop.run.by.Ashley..or.watch.it.online.."
## [7] "Previous.experience.with.R"
## [8] "What.software.do.you.currently.use.most.to.analyze.your.data."
## [9] "Favorite.number"
## [10] "favorite.color"
## [11] "Gender"
## [12] "number.of.years.at.FSU..or.your.current.institution."
## [13] "zipcode.of.home.town..5.digits..e.g...32304."
## [14] "Suggestions.of.things.to.include.at.workshop..or.email.me.directly.at.aedwards.psy.fsu.edu."

# Replace the long headers with short names, listed in the same order as the
# 14 columns printed above.
names(d) <- c(
  "time", "email", "position", "attend", "from",
  "last", "experience", "software", "favnum",
  "favcolor", "gender", "years", "zipcode", "suggest"
)
# Base-graphics first look: distribution of R experience, then experience
# against years at the current institution.
hist(d$experience)
plot(d$years, d$experience)
library(ggplot2)

# Mean experience per gender drawn as bars. The summary function is passed as
# `fun`; the older `fun.y` spelling is deprecated since ggplot2 3.3.0.
ggplot(d, aes(x = gender, y = experience)) +
  stat_summary(fun = mean, geom = "bar")
# Same bars, flipped to run horizontally.
ggplot(d, aes(x = gender, y = experience)) +
  stat_summary(fun = mean, geom = "bar") +
  coord_flip()
# Full distribution per gender instead of just the mean.
ggplot(d, aes(x = gender, y = experience)) +
  geom_boxplot()
# Mean experience per software package, one line per gender.
ggplot(d, aes(software, experience, group = gender, color = gender)) +
  stat_summary(fun = mean, geom = "line")
# Same means as side-by-side (dodged) bars.
ggplot(d, aes(software, experience, group = gender, color = gender, fill = gender)) +
  stat_summary(fun = mean, geom = "bar", position = position_dodge())

# Scatter plot coloured by favourite colour, using ggplot2's default palette.
ggplot(d, aes(x = favnum, y = years, color = favcolor)) +
  geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).

# Map each favourite-colour answer to the R colour of the same name.
cols <- c("red" = "red", "orange" = "orange", "yellow" = "yellow",
          "green" = "green", "blue" = "blue", "purple" = "purple",
          "pink" = "pink", "black" = "black", "brown" = "brown")
ggplot(d, aes(x = favnum, y = years, color = favcolor)) +
  geom_point() +
  scale_color_manual(values = cols)
## Warning: Removed 2 rows containing missing values (geom_point).
# One panel per gender, laid out as columns.
ggplot(d, aes(x = favnum, y = years, color = favcolor)) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(. ~ gender)
## Warning: Removed 2 rows containing missing values (geom_point).
# One panel per gender, laid out as rows.
ggplot(d, aes(x = favnum, y = years, color = favcolor)) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)
## Warning: Removed 2 rows containing missing values (geom_point).
# Point size now encodes experience.
ggplot(d, aes(x = favnum, y = years, color = favcolor, size = experience)) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)
## Warning: Removed 2 rows containing missing values (geom_point).
# alpha placed inside aes() is treated as a mapped aesthetic rather than a
# fixed setting; presumably kept here to contrast with the next plot.
ggplot(d, aes(x = favnum, y = years, color = favcolor, size = experience, alpha = .5)) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)
## Warning: Removed 2 rows containing missing values (geom_point).
# alpha as a fixed parameter of the point layer.
ggplot(d, aes(x = favnum, y = years, color = favcolor, size = experience)) +
  geom_point(alpha = .5) +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)
## Warning: Removed 2 rows containing missing values (geom_point).
# Final version with readable axis and legend titles.
ggplot(d, aes(x = favnum, y = years, color = favcolor, size = experience)) +
  geom_point(alpha = .5) +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .) +
  xlab("Favorite Number") +
  ylab("Number of Years at Current Institution") +
  labs(color = "Favorite Color", size = "experience with R")
## Warning: Removed 2 rows containing missing values (geom_point).
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 3.4.4
# Scatter plot built without facet_grid(): per the original note, the marginal
# plots cannot be done with facets. The legend is hidden as well.
p <- ggplot(d, aes(x = favnum, y = years, color = favcolor, size = experience)) +
  geom_point(alpha = .5) +
  scale_color_manual(values = cols) +
  labs(
    x = "Favorite Number",
    y = "Number of Years at Current Institution",
    color = "Favorite Color",
    size = "experience with R"
  ) +
  theme(legend.position = "none")

# Add marginal distributions of both axes around the scatter plot.
ggMarginal(p, type = "histogram")
## Warning: Removed 2 rows containing missing values (geom_point).
ggMarginal(p, type = "boxplot")
## Warning: Removed 2 rows containing missing values (geom_point).
# Count how many respondents named each analysis package.
table(d$software)
##
## Excel GraphPad other R SAS SPSS Stata
## 2 4 2 3 9 2 23 1
# Turn the counts into a data frame (columns Var1 / Freq) so ggplot can use it.
soft <- data.frame(table(d$software))
soft
## Var1 Freq
## 1 2
## 2 Excel 4
## 3 GraphPad 2
## 4 other 3
## 5 R 9
## 6 SAS 2
## 7 SPSS 23
## 8 Stata 1
names(soft) <- c("software", "frequency")

# Build the pie chart one step at a time.
# Step 1: a single stacked bar (empty x), one segment per package.
ggplot(soft, aes(x = "", y = frequency, fill = software)) +
  geom_bar(stat = "identity", width = 1)
# Step 2: wrap the y axis around a circle so the stack becomes a pie.
ggplot(soft, aes(x = "", y = frequency, fill = software)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y")
# Step 3: strip axes and background.
ggplot(soft, aes(x = "", y = frequency, fill = software)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_void()
# Step 4: print each count in the middle of its slice.
ggplot(soft, aes(x = "", y = frequency, fill = software)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    aes(label = frequency),
    position = position_stack(vjust = .5)
  )
library(zipcode)
# Clean the self-reported zip codes with clean.zipcodes() and keep the result
# in a `zip` column used as the join key below.
d$zip <- clean.zipcodes(d$zipcode)
# Load the package's `zipcode` lookup table and join it onto the responses.
# merge() keeps only rows whose zip appears in both tables.
data("zipcode")
d1 <- merge(x = d, y = zipcode, by = "zip")
# Outlines of the US states for the background layer.
us <- ggplot2::map_data("state")
## Warning: package 'maps' was built under R version 3.4.4

# State outlines only.
ggplot(d1, aes(longitude, latitude)) +
  geom_polygon(
    data = us,
    aes(x = long, y = lat, group = group),
    color = "gray", fill = NA
  )
# State outlines with one semi-transparent point per respondent's home town.
ggplot(d1, aes(longitude, latitude)) +
  geom_polygon(
    data = us,
    aes(x = long, y = lat, group = group),
    color = "gray", fill = NA
  ) +
  geom_point(alpha = .5)
library(ggcorrplot)
data(mtcars)
# Correlation matrix of all mtcars columns, rounded to one decimal place.
corr <- round(cor(mtcars), 1)
# Default plot, then circles with the coefficient printed in each cell.
ggcorrplot(corr)
ggcorrplot(corr, lab = TRUE, method = "circle")
# Custom three-colour gradient. The argument is spelled `colors` in full
# (as in the later call in this script) instead of relying on R's partial
# argument matching to resolve `color`.
ggcorrplot(corr, lab = TRUE, method = "circle", colors = c("red", "white", "green"))
# Build a square matrix of cor.test() p-values for every pair of columns in x.
#   x: a data frame or matrix with column names.
# Returns a matrix whose row and column names are colnames(x); element [i, j]
# is the p-value of cor.test() between columns i and j.
cor.test.p <- function(x) {
  vars <- colnames(x)

  # p-value for a single pair of columns, selected by name.
  pair_p_value <- function(i, j) {
    cor.test(x[, i], x[, j])[["p.value"]]
  }

  # outer() hands over whole vectors of names, so the scalar helper has to be
  # vectorised before it can be applied to every (row, column) combination.
  p_values <- outer(vars, vars, Vectorize(pair_p_value))
  dimnames(p_values) <- list(vars, vars)
  p_values
}
# Pairwise p-values for the mtcars columns, in the same layout as `corr`.
pmatrix <- cor.test.p(mtcars)
# Same circle plot as before, now passing the p-value matrix so ggcorrplot
# can mark cells via insig = "pch", drawn with an enlarged symbol.
ggcorrplot(
  corr,
  method = "circle",
  lab = TRUE,
  colors = c("red", "white", "green"),
  p.mat = pmatrix,
  insig = "pch",
  pch.cex = 10
)
#taken from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
## Summarise one measure per group: count, mean, standard deviation, standard
## error of the mean, and confidence-interval half-width.
##   data: a data frame.
##   measurevar: the name of the column to summarise.
##   groupvars: a vector containing names of the grouping columns.
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
##   .drop: passed through to plyr::ddply().
## Returns a data frame with one row per group holding the grouping columns,
## N, the mean (in a column named after measurevar), sd, se and ci.
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
  # plyr is called through its namespace instead of library(plyr): attaching a
  # package from inside a function changes the caller's search path and can
  # mask objects from packages the caller already loaded.
  if (!requireNamespace("plyr", quietly = TRUE)) {
    stop("summarySE() requires the 'plyr' package to be installed",
         call. = FALSE)
  }

  # New version of length which can handle NA's: if na.rm is TRUE, don't
  # count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) {
      sum(!is.na(x))
    } else {
      length(x)
    }
  }

  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd. measurevar is forwarded by ddply() as the `col` argument.
  datac <- plyr::ddply(data, groupvars, .drop = .drop,
    .fun = function(xx, col) {
      c(N    = length2(xx[[col]], na.rm = na.rm),
        mean = mean(xx[[col]], na.rm = na.rm),
        sd   = sd(xx[[col]], na.rm = na.rm)
      )
    },
    measurevar
  )

  # Rename the "mean" column to the name of the measured variable
  datac <- plyr::rename(datac, c("mean" = measurevar))

  # Standard error of the mean
  datac$se <- datac$sd / sqrt(datac$N)

  # Confidence interval multiplier for standard error: the t quantile for the
  # requested two-sided interval,
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult

  datac
}
# Mean R experience (with sd, se and ci) grouped by the `last` column, i.e.
# whether the respondent attended the previous workshop.
summarydata <- summarySE(d, measurevar = "experience", groupvars = "last")
##
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
##
## ozone
summarydata
## last N experience sd se ci
## 1 No 22 3.636364 2.460071 0.5244888 1.0907343
## 2 Yes 24 2.875000 1.940697 0.3961431 0.8194844
# Bars show the group means; error bars span one standard error either side.
# The error-bar width is a fixed setting, not data, so it sits outside aes().
ggplot(summarydata, aes(x = last, y = experience)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = experience - se, ymax = experience + se), width = .2)
#taken from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
## If there are within-subject variables, calculate adjusted values using method from Morey (2008).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summarized
## betweenvars: a vector containing names of columns that are between-subjects variables
## withinvars: a vector containing names of columns that are within-subjects variables
## idvar: the name of a column that identifies each subject (or matched subjects)
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySEwithin <- function(data=NULL, measurevar, betweenvars=NULL, withinvars=NULL,
                            idvar=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {
  # All grouping columns: between-subjects first, then within-subjects.
  groupvars <- c(betweenvars, withinvars)

  # Grouping columns have to be factors; coerce any that are not and report
  # which ones were changed.
  is_factor <- vapply(data[, groupvars, drop = FALSE],
                      FUN = is.factor, FUN.VALUE = logical(1))
  if (!all(is_factor)) {
    to_convert <- names(is_factor)[!is_factor]
    message("Automatically converting the following non-factors to factors: ",
            paste(to_convert, collapse = ", "))
    data[to_convert] <- lapply(data[to_convert], factor)
  }

  # Group means are taken from the raw (un-normed) data. Its sd/se/ci columns
  # are discarded because they are recalculated from the normed data below.
  raw_summary <- summarySE(data, measurevar, groupvars = groupvars,
                           na.rm = na.rm, conf.interval = conf.interval,
                           .drop = .drop)
  raw_summary$sd <- NULL
  raw_summary$se <- NULL
  raw_summary$ci <- NULL

  # Norm each subject's data; the normed measure is expected in a new column
  # named "<measurevar>_norm".
  normed <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm,
                           .drop = .drop)
  normed_measure <- paste0(measurevar, "_norm")

  # Summarise the normed data; between and within variables can now be
  # treated alike.
  normed_summary <- summarySE(normed, normed_measure, groupvars = groupvars,
                              na.rm = na.rm, conf.interval = conf.interval,
                              .drop = .drop)

  # Morey (2008) correction: scale the spread estimates by
  # sqrt(k / (k - 1)), where k is the product of the number of levels of the
  # within-subjects variables.
  n_within <- prod(vapply(normed_summary[, withinvars, drop = FALSE],
                          FUN = nlevels, FUN.VALUE = numeric(1)))
  correction <- sqrt(n_within / (n_within - 1))
  for (spread_col in c("sd", "se", "ci")) {
    normed_summary[[spread_col]] <- normed_summary[[spread_col]] * correction
  }

  # Join the un-normed means with the corrected normed results.
  merge(raw_summary, normed_summary)
}