data visualization workshop

# Read the workshop sign-up responses from the downloaded CSV export.
# NOTE(review): the path is specific to the author's machine; point it at
# wherever the export was saved before running.
d <- read.csv(
  file = "~/Downloads/data visualization in R workshop (Responses) - Form Responses 1 (8).csv"
)
# Print the raw column names that read.csv() derived from the form questions.
colnames(d)

##  [1] "Timestamp"                                                                                  
##  [2] "Email.Address"                                                                              
##  [3] "Position.title"                                                                             
##  [4] "Do.you.plan.on.attending.this.R.workshop.on.data.visualization."                            
##  [5] "I.am.attending.from"                                                                        
##  [6] "Did.you.attend.the.previous.R.workshop.run.by.Ashley..or.watch.it.online.."                 
##  [7] "Previous.experience.with.R"                                                                 
##  [8] "What.software.do.you.currently.use.most.to.analyze.your.data."                              
##  [9] "Favorite.number"                                                                            
## [10] "favorite.color"                                                                             
## [11] "Gender"                                                                                     
## [12] "number.of.years.at.FSU..or.your.current.institution."                                       
## [13] "zipcode.of.home.town..5.digits..e.g...32304."                                               
## [14] "Suggestions.of.things.to.include.at.workshop..or.email.me.directly.at.aedwards.psy.fsu.edu."

# Replace the long question-derived headers with short names, in the same
# order as the 14 columns listed above.
names(d) <- c(
  "time", "email", "position", "attend", "from", "last", "experience",
  "software", "favnum", "favcolor", "gender", "years", "zipcode", "suggest"
)

# Base-graphics histogram of self-reported R experience.
hist(x = d$experience)

# Base-graphics scatter plot: years at the institution vs. R experience.
plot(x = d$years, y = d$experience)

library(ggplot2)

# Mean R experience per gender, drawn as bars.
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3.0 in favor of `fun`;
# it is kept so the script still runs on older ggplot2 versions.
ggplot(data = d, mapping = aes(x = gender, y = experience)) +
  stat_summary(geom = "bar", fun.y = mean)

# Same bars with the axes flipped, so gender runs along the vertical axis.
ggplot(data = d, mapping = aes(x = gender, y = experience)) +
  stat_summary(geom = "bar", fun.y = mean) +
  coord_flip()

# Distribution of experience within each gender as box plots.
ggplot(data = d, mapping = aes(x = gender, y = experience)) +
  geom_boxplot()

# Mean experience per software package, one line per gender.
ggplot(
  data = d,
  mapping = aes(x = software, y = experience, group = gender, color = gender)
) +
  stat_summary(geom = "line", fun.y = mean)

# The same means as dodged (side-by-side) bars, filled by gender.
ggplot(
  data = d,
  mapping = aes(
    x = software, y = experience,
    group = gender, color = gender, fill = gender
  )
) +
  stat_summary(geom = "bar", fun.y = mean, position = position_dodge())

# Favorite number vs. years at the institution, colored by favorite color
# (default ggplot2 palette).
ggplot(data = d, mapping = aes(x = favnum, y = years, color = favcolor)) +
  geom_point()

## Warning: Removed 2 rows containing missing values (geom_point).

# Named vector mapping each favorite-color value to the R color of the same
# name, so that e.g. "red" responses are actually drawn in red.
cols <- c(
  red = "red", orange = "orange", yellow = "yellow",
  green = "green", blue = "blue", purple = "purple",
  pink = "pink", black = "black", brown = "brown"
)
ggplot(data = d, mapping = aes(x = favnum, y = years, color = favcolor)) +
  geom_point() +
  scale_color_manual(values = cols)

## Warning: Removed 2 rows containing missing values (geom_point).

# One panel per gender, laid out side by side (gender on the column side of
# the facet formula).
ggplot(data = d, mapping = aes(x = favnum, y = years, color = favcolor)) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(. ~ gender)

## Warning: Removed 2 rows containing missing values (geom_point).

# One panel per gender, stacked vertically (gender on the row side of the
# facet formula).
ggplot(data = d, mapping = aes(x = favnum, y = years, color = favcolor)) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)

## Warning: Removed 2 rows containing missing values (geom_point).

# Additionally map point size to R experience.
ggplot(
  data = d,
  mapping = aes(x = favnum, y = years, color = favcolor, size = experience)
) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)

## Warning: Removed 2 rows containing missing values (geom_point).

# Here alpha is placed inside aes(), so ggplot2 treats the constant .5 as a
# mapped aesthetic rather than a fixed setting; the next plot moves it into
# geom_point() instead.
ggplot(
  data = d,
  mapping = aes(
    x = favnum, y = years, color = favcolor, size = experience, alpha = .5
  )
) +
  geom_point() +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)

## Warning: Removed 2 rows containing missing values (geom_point).

# alpha as a fixed geom_point() argument: every point is drawn at 50% opacity.
ggplot(
  data = d,
  mapping = aes(x = favnum, y = years, color = favcolor, size = experience)
) +
  geom_point(alpha = .5) +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .)

## Warning: Removed 2 rows containing missing values (geom_point).

# Same plot with readable axis and legend titles.
ggplot(
  data = d,
  mapping = aes(x = favnum, y = years, color = favcolor, size = experience)
) +
  geom_point(alpha = .5) +
  scale_color_manual(values = cols) +
  facet_grid(gender ~ .) +
  labs(
    x = "Favorite Number",
    y = "Number of Years at Current Institution",
    color = "Favorite Color",
    size = "experience with R"
  )

## Warning: Removed 2 rows containing missing values (geom_point).

library(ggExtra)

## Warning: package 'ggExtra' was built under R version 3.4.4

# The facets are dropped here: marginal plots cannot be added to a faceted
# plot. The legend is hidden as well.
p <- ggplot(
  data = d,
  mapping = aes(x = favnum, y = years, color = favcolor, size = experience)
) +
  geom_point(alpha = .5) +
  scale_color_manual(values = cols) +
  labs(
    x = "Favorite Number",
    y = "Number of Years at Current Institution",
    color = "Favorite Color",
    size = "experience with R"
  ) +
  theme(legend.position = "none")

# Scatter plot with marginal histograms of both axes.
ggMarginal(p, type = "histogram")

## Warning: Removed 2 rows containing missing values (geom_point).

# Scatter plot with marginal box plots of both axes.
ggMarginal(p, type = "boxplot")

## Warning: Removed 2 rows containing missing values (geom_point).

# Count how many respondents named each software package.
table(d[["software"]])

## 
##             Excel GraphPad    other        R      SAS     SPSS    Stata 
##        2        4        2        3        9        2       23        1

# Turn the counts into a two-column data frame (Var1 / Freq) for ggplot2.
soft <- data.frame(table(d[["software"]]))
soft

##       Var1 Freq
## 1             2
## 2    Excel    4
## 3 GraphPad    2
## 4    other    3
## 5        R    9
## 6      SAS    2
## 7     SPSS   23
## 8    Stata    1

# Give the count table descriptive column names.
names(soft) <- c("software", "frequency")

# Step 1: a single stacked bar, one segment per software package.
ggplot(data = soft, mapping = aes(x = "", y = frequency, fill = software)) +
  geom_col(width = 1)

# Step 2: wrap the stacked bar around a circle to obtain a pie chart.
ggplot(data = soft, mapping = aes(x = "", y = frequency, fill = software)) +
  geom_col(width = 1) +
  coord_polar(theta = "y")

# Step 3: strip axes, grid and background.
ggplot(data = soft, mapping = aes(x = "", y = frequency, fill = software)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  theme_void()

# Step 4: print each count in the middle of its slice.
ggplot(data = soft, mapping = aes(x = "", y = frequency, fill = software)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    mapping = aes(label = frequency),
    position = position_stack(vjust = .5)
  )

library(zipcode)
# Clean the self-reported zip codes so they can be matched against the
# `zipcode` lookup table shipped with the zipcode package.
d[["zip"]] <- clean.zipcodes(d[["zipcode"]])
data("zipcode")
# Join on zip; the plots below use the longitude/latitude columns this adds.
# Respondents whose zip has no match in the lookup table are dropped by merge().
d1 <- merge(x = d, y = zipcode, by = "zip")
# Outline polygons of the US states.
us <- ggplot2::map_data(map = "state")

## Warning: package 'maps' was built under R version 3.4.4

# State outlines only.
ggplot(data = d1, mapping = aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = us,
    mapping = aes(x = long, y = lat, group = group),
    color = "gray", fill = NA
  )

# State outlines plus one semi-transparent point per respondent home town.
ggplot(data = d1, mapping = aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = us,
    mapping = aes(x = long, y = lat, group = group),
    color = "gray", fill = NA
  ) +
  geom_point(alpha = .5)

library(ggcorrplot)
# Correlation matrix of the built-in mtcars data, rounded to one decimal.
data(mtcars)
corr <- round(cor(mtcars), 1)

# Default correlogram.
ggcorrplot(corr)

# Circles instead of squares, with the coefficient printed in each cell.
ggcorrplot(corr, lab = TRUE, method = "circle")

# Custom low / mid / high color gradient. The argument is spelled out as
# `colors` (as in the later ggcorrplot() call) rather than relying on R's
# partial matching of `color`.
ggcorrplot(corr, lab = TRUE, method = "circle", colors = c("red", "white", "green"))

# Pairwise correlation-test p-values.
#   x: a data frame or matrix with column names.
# Returns a square matrix whose [i, j] entry is the p-value of
# cor.test() applied to columns i and j of x; rows and columns are
# labelled with the column names of x.
cor.test.p <- function(x) {
  vars <- colnames(x)

  # p-value of the correlation test between two columns, given by name
  pair_p <- function(i, j) {
    cor.test(x[, i], x[, j])$p.value
  }

  # outer() passes whole vectors of names, so the scalar helper is vectorized
  z <- outer(vars, vars, Vectorize(pair_p))
  dimnames(z) <- list(vars, vars)
  z
}

# p-value for every pair of mtcars columns.
pmatrix <- cor.test.p(mtcars)

# Correlogram with p-values supplied via p.mat; insig = "pch" asks
# ggcorrplot to mark the non-significant cells with a symbol, whose size
# is set by pch.cex.
ggcorrplot(
  corr,
  method = "circle",
  lab = TRUE,
  colors = c("red", "white", "green"),
  p.mat = pmatrix,
  insig = "pch",
  pch.cex = 10
)

#taken from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
## Summarizes one measure per group: count, mean, standard deviation,
## standard error of the mean, and confidence-interval half-width.
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summarized
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
##   .drop: passed through to plyr::ddply
## Returns a data frame with the grouping columns plus N, <measurevar> (the
## group mean), sd, se and ci.
## plyr is called through its namespace instead of library(plyr), so calling
## this function no longer attaches plyr (and masks other packages' objects)
## as a side effect, and `rename` cannot be shadowed by dplyr::rename.
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {

  # Version of length() which can handle NA's: if na.rm is TRUE, don't count them
  length2 <- function(x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
  }

  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- plyr::ddply(data, groupvars, .drop=.drop,
                       .fun = function(xx, col) {
                         c(N    = length2(xx[[col]], na.rm=na.rm),
                           mean = mean   (xx[[col]], na.rm=na.rm),
                           sd   = sd     (xx[[col]], na.rm=na.rm)
                         )
                       },
                       measurevar
  )

  # Rename the "mean" column to the name of the measured variable
  datac <- plyr::rename(datac, c("mean" = measurevar))

  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval:
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N-1)
  datac$ci <- datac$se * ciMult

  return(datac)
}

# Mean R experience (with sd, se and ci) for those who did / did not attend
# the previous workshop.
summarydata <- summarySE(d, measurevar = "experience", groupvars = "last")

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:maps':
## 
##     ozone

summarydata

##   last  N experience       sd        se        ci
## 1   No 22   3.636364 2.460071 0.5244888 1.0907343
## 2  Yes 24   2.875000 1.940697 0.3961431 0.8194844

# Bar chart of the group means with +/- 1 standard error bars.
# `width` is a fixed setting, not a data mapping, so it is passed to
# geom_errorbar() directly instead of inside aes().
ggplot(summarydata, aes(x = last, y = experience)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = experience - se, ymax = experience + se), width = .2)

#taken from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
## Summarizes data that has within-subject variables: the reported means come
## from the raw data, while sd/se/ci are computed on per-subject normed data
## and adjusted using the method from Morey (2008).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summarized
##   betweenvars: a vector containing names of columns that are between-subjects variables
##   withinvars: a vector containing names of columns that are within-subjects variables
##   idvar: the name of a column that identifies each subject (or matched subjects)
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
##   .drop: passed through to summarySE and normDataWithin
## NOTE(review): normDataWithin() is not defined in the visible part of this
## file; it has to be defined or sourced before this function is called.
summarySEwithin <- function(data=NULL, measurevar, betweenvars=NULL, withinvars=NULL,
                            idvar=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {

  group_cols <- c(betweenvars, withinvars)

  # All between- and within-subject variables must be factors; convert any
  # that are not, and tell the user which ones were converted
  is_fac <- vapply(data[, group_cols, drop=FALSE],
                   FUN=is.factor, FUN.VALUE=logical(1))
  if (any(!is_fac)) {
    to_convert <- names(is_fac)[!is_fac]
    message("Automatically converting the following non-factors to factors: ",
            paste(to_convert, collapse = ", "))
    data[to_convert] <- lapply(data[to_convert], factor)
  }

  # Means come from the un-normed data; its sd/se/ci are discarded because
  # they are recomputed from the normed data below
  raw_summary <- summarySE(data, measurevar, groupvars=group_cols,
                           na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)
  for (stat in c("sd", "se", "ci")) {
    raw_summary[[stat]] <- NULL
  }

  # Norm each subject's data; the normed values live in "<measurevar>_norm"
  normed <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm, .drop=.drop)
  normed_name <- paste0(measurevar, "_norm")

  # Collapse the normed data - between and within vars are now treated alike
  normed_summary <- summarySE(normed, normed_name, groupvars=group_cols,
                              na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

  # Morey (2008) correction: scale by sqrt(k / (k - 1)), where k is the
  # product of the number of levels of the within-subject variables
  n_within <- prod(vapply(normed_summary[, withinvars, drop=FALSE], FUN=nlevels,
                          FUN.VALUE=numeric(1)))
  correction <- sqrt(n_within / (n_within - 1))
  for (stat in c("sd", "se", "ci")) {
    normed_summary[[stat]] <- normed_summary[[stat]] * correction
  }

  # Combine the un-normed means with the corrected normed statistics
  merge(raw_summary, normed_summary)
}

data visualization workshop

Ashley Edwards

February 12, 2019