Joshua Rosenberg
4/14/2017
R is a language
R is free
A flexible statistical analysis toolkit
Graphics and data visualization
Access to powerful, cutting-edge analytics
A robust, vibrant community
R is effectively platform independent
R is widely used by academic statisticians
This is code (syntax):
# install.packages("tidyverse") # run once to install the tidyverse
library(tidyverse) # load the tidyverse packages
df <- nycflights13::flights # flights departing New York City in 2013
This is code (syntax):
df_ss <- select(df, dep_delay, arr_delay, air_time, distance, carrier) # keep five variables
psych::describe(df_ss) # descriptive statistics for each variable
vars n mean sd median trimmed mad min max range
dep_delay 1 328521 12.64 40.21 -2 3.32 5.93 -43 1301 1344
arr_delay 2 327346 6.90 44.63 -5 -1.03 20.76 -86 1272 1358
air_time 3 327346 150.69 93.69 129 140.03 75.61 20 695 675
distance 4 336776 1039.91 733.23 872 955.27 569.32 17 4983 4966
carrier* 5 336776 9.00 0.00 9 9.00 0.00 9 9 0
skew kurtosis se
dep_delay 4.80 43.95 0.07
arr_delay 3.72 29.23 0.08
air_time 1.07 0.86 0.16
distance 1.13 1.19 1.26
carrier* NaN NaN 0.00
df %>%
select(dep_delay, arr_delay, air_time, distance, carrier) %>%
group_by(carrier) %>%
summarize(dep_delay_mean = mean(dep_delay, na.rm = T))
# A tibble: 16 × 2
carrier dep_delay_mean
<chr> <dbl>
1 9E 16.725769
2 AA 8.586016
3 AS 5.804775
4 B6 13.022522
5 DL 9.264505
6 EV 19.955390
7 F9 20.215543
8 FL 18.726075
9 HA 4.900585
10 MQ 10.552041
11 OO 12.586207
12 UA 12.106073
13 US 3.782418
14 VX 12.869421
15 WN 17.711744
16 YV 18.996330
df$wday <- lubridate::wday(df$time_hour, label = T) # day of week as a labeled factor
ggplot(df, aes(x = wday, y = arr_delay)) +
geom_point()
library(ggplot2)
ggplot(df, aes(x = wday, y = arr_delay)) +
geom_jitter()
ggplot(df, aes(x = air_time, y = arr_delay)) +
geom_point()
ggplot(df_ss, aes(x = air_time, y = arr_delay, color = carrier)) +
geom_point()
to_plot <- df %>%
group_by(carrier) %>%
summarize(dep_delay_mean = mean(dep_delay, na.rm = T),
n = n()) %>%
filter(n > 10000) %>%
arrange(desc(dep_delay_mean))
to_plot
# A tibble: 9 × 3
carrier dep_delay_mean n
<chr> <dbl> <int>
1 EV 19.955390 54173
2 WN 17.711744 12275
3 9E 16.725769 18460
4 B6 13.022522 54635
5 UA 12.106073 58665
6 MQ 10.552041 26397
7 DL 9.264505 48110
8 AA 8.586016 32729
9 US 3.782418 20536
So, we should fly with US Airways?
ggplot(to_plot, aes(x = reorder(carrier, dep_delay_mean), y = dep_delay_mean)) +
geom_col() +
theme(text = element_text(size = 16)) +
xlab("Carrier")
lm() formula syntax (also used by other modeling functions in R):
y ~ x1
m1 <- lm(arr_delay ~ air_time, data = df)
arm::display(m1)
lm(formula = arr_delay ~ air_time, data = df)
coef.est coef.se
(Intercept) 9.43 0.15
air_time -0.02 0.00
---
n = 327346, k = 2
residual sd = 44.61, R-Squared = 0.00
y ~ x1 + x2
m2 <- lm(arr_delay ~ air_time + distance, data = df)
arm::display(m2)
lm(formula = arr_delay ~ air_time + distance, data = df)
coef.est coef.se
(Intercept) -1.46 0.17
air_time 0.67 0.01
distance -0.09 0.00
---
n = 327346, k = 3
residual sd = 43.73, R-Squared = 0.04
Using x1*x2 in a formula is equivalent to:
y ~ x1 + x2 + x1:x2
m3 <- lm(arr_delay ~ air_time*distance, data = df)
arm::display(m3)
lm(formula = arr_delay ~ air_time * distance, data = df)
coef.est coef.se
(Intercept) -0.07 0.24
air_time 0.66 0.01
distance -0.09 0.00
air_time:distance 0.00 0.00
---
n = 327346, k = 4
residual sd = 43.72, R-Squared = 0.04
Factors are automatically dummy-coded.
Check factor levels in the output.
Use levels(df$carrier) <- or functions from library(forcats) to change factor levels (see the sketch below).
m3 <- lm(arr_delay ~ air_time*distance + carrier, data = df)
arm::display(m3)
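A minimal sketch of changing factor levels with forcats before modeling; treating "UA" as the reference level and the model name m4 are illustrative choices, not part of the original analysis:
library(forcats)
df$carrier <- as.factor(df$carrier) # carrier starts out as a character vector
levels(df$carrier) # check the current level order
df$carrier <- fct_relevel(df$carrier, "UA") # move "UA" to the front (the reference level)
m4 <- lm(arr_delay ~ air_time*distance + carrier, data = df)
arm::display(m4)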
my_vector <- c(1:10)
This is the result of typing the name of “my_vector”:
my_vector
[1] 1 2 3 4 5 6 7 8 9 10
This finds the mean:
mean(my_vector)
[1] 5.5
What's this thing?
- ? # this is to find out what a function does
- str() # this is to find out the 'structure' of anything
- View() # this allows you to view a data frame (think spreadsheet) or matrix
- class() # this tells you what kind of object this is
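For example, applied to my_vector (and to the df data frame from earlier):
?mean # open the help page for mean()
str(my_vector) # int [1:10] 1 2 3 4 5 6 7 8 9 10
class(my_vector) # "integer"
View(df) # spreadsheet-style viewer for a data frame (e.g., in RStudio)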
How do I pick just one part of this thing?
my_data[1, ] # just the first row of data frame
my_data[, 1] # just the first column of data frame
head(my_data) # first six rows of data frame
tail(my_data) # last six rows of data frame
Loading a comma separated values (CSV) file:
setwd("~/documents") # this sets the working directory
my_data <- readr::read_csv("r_introduction_data.csv") # loads a CSV and saves it to `my_data`
my_data
# A tibble: 212 × 57
Int StudentID Grade Age Gender ClassTeacher SciTeacher PrevIQWST
<int> <chr> <int> <int> <int> <int> <int> <int>
1 1 A.J. Miranda 1 11 0 5 2 NA
2 1 Abby B. 0 10 1 0 0 0
3 0 Abby E 0 10 0 1 0 0
4 0 Abi N. 1 11 1 4 2 NA
5 1 Abigail D. 1 11 1 7 3 NA
6 0 Adam F 1 12 0 5 2 NA
7 NA Adam L. 1 11 0 4 2 NA
8 0 Adam T. 0 9 0 1 0 0
9 1 Addison D. 1 11 1 6 3 NA
10 1 Adelle S. 1 11 1 4 2 NA
# ... with 202 more rows, and 49 more variables: PreEff1 <int>,
# PreEff2 <int>, PreEff3 <int>, PreEff4 <int>, PreEff5 <int>,
# PreInt1 <int>, PreInt2Rev <int>, PreInt3 <int>, PreInt4 <int>,
# PreInt5 <int>, PreVal1 <int>, PreVal2 <int>, PreVal3 <int>,
# PreVal4 <int>, PreVal5 <int>, PreVal6 <int>, PreVal7 <int>,
# PreVal8 <int>, PreEff_Ave <dbl>, PreInt_Ave <dbl>, PreVal_Ave <dbl>,
# PostEff1 <int>, PostEff2 <int>, PostEff3 <int>, PostEff4 <int>,
# PostEff5 <int>, PostInt1 <int>, PostInt2 <int>, PostInt3 <int>,
# PostInt4 <int>, PostInt5 <int>, PostVal1 <int>, PostVal2 <int>,
# PostVal3 <int>, PostVal4 <int>, PostVal5 <int>, PostVal6 <int>,
# PostVal7 <int>, PostVal8 <int>, PostMaintInt1 <int>,
# PostMaintInt2 <int>, PostMaintInt3 <int>, PostMaintInt4 <int>,
# PostEff_Ave <dbl>, PostInt_Ave <dbl>, PostVal_Ave <dbl>,
# PostMaintInt_Ave <dbl>, PreAchievement <int>, PostAchievement <int>
Examples of loading data for different sorts of files
read.delim("filename.txt") # Tab-delimited
readxl::read_excel("filename.xlsx") # Excel
haven::read_sav("filename.sav") # SPSS
my_data %>% count(SciTeacher) # counts frequencies and creates a table
# A tibble: 4 × 2
SciTeacher n
<int> <int>
1 0 53
2 1 55
3 2 53
4 3 51
my_data_ss <- select(my_data, contains("_Ave")) # this selects any variables containing "_Ave"
summary(my_data_ss) # creates summary statistics for continuous variables
PreEff_Ave PreInt_Ave PreVal_Ave PostEff_Ave
Min. :2.200 Min. :1.000 Min. :1.875 Min. :1.200
1st Qu.:5.000 1st Qu.:4.800 1st Qu.:5.125 1st Qu.:4.800
Median :5.600 Median :6.200 Median :6.125 Median :5.600
Mean :5.466 Mean :5.739 Mean :5.780 Mean :5.354
3rd Qu.:6.200 3rd Qu.:6.800 3rd Qu.:6.625 3rd Qu.:6.200
Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
NA's :16
PostInt_Ave PostVal_Ave PostMaintInt_Ave
Min. :1.000 Min. :1.500 Min. :1.00
1st Qu.:4.400 1st Qu.:4.875 1st Qu.:3.50
Median :5.800 Median :6.250 Median :5.00
Mean :5.394 Mean :5.691 Mean :4.72
3rd Qu.:6.600 3rd Qu.:6.750 3rd Qu.:6.00
Max. :7.000 Max. :7.000 Max. :7.00
NA's :16 NA's :16 NA's :18
dplyr::select(my_data, PreAchievement, PostAchievement) # Select only certain columns
dplyr::filter(my_data, PreAchievement >= 3) # select only certain rows
dplyr::arrange(my_data, PostAchievement) # arrange data by a variable
dplyr is often used in “pipes”:
my_data %>%
filter(PreAchievement >= 3) %>%
group_by(SciTeacher) %>%
summarize(PostAchievement_mean = mean(PostAchievement, na.rm = T))
Example from tidyr documentation
stocks <- data_frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 0, 1),
Y = rnorm(10, 0, 2),
Z = rnorm(10, 0, 4)
)
stocks
# A tibble: 10 × 4
time X Y Z
<date> <dbl> <dbl> <dbl>
1 2009-01-01 -1.83043170 -0.3429587 -3.0346175
2 2009-01-02 -0.03840373 -1.2165466 -5.0420255
3 2009-01-03 -0.31792713 -3.5256832 -5.6039987
4 2009-01-04 -0.04249702 -0.4144008 -4.9044614
5 2009-01-05 0.67440794 -0.8861785 5.1264533
6 2009-01-06 0.16987496 5.0971920 4.6215798
7 2009-01-07 -0.98131256 -0.9293778 -4.7451135
8 2009-01-08 0.48264218 -0.1996061 0.9067443
9 2009-01-09 -0.07346683 1.6549908 -5.3301054
10 2009-01-10 -0.23915841 -1.0623524 5.9200525
gather(stocks, stock, price, -time)
# A tibble: 30 × 3
time stock price
<date> <chr> <dbl>
1 2009-01-01 X -1.83043170
2 2009-01-02 X -0.03840373
3 2009-01-03 X -0.31792713
4 2009-01-04 X -0.04249702
5 2009-01-05 X 0.67440794
6 2009-01-06 X 0.16987496
7 2009-01-07 X -0.98131256
8 2009-01-08 X 0.48264218
9 2009-01-09 X -0.07346683
10 2009-01-10 X -0.23915841
# ... with 20 more rows
stocks_long <- gather(stocks, stock, price, -time)
spread(stocks_long, stock, price)
# A tibble: 10 × 4
time X Y Z
* <date> <dbl> <dbl> <dbl>
1 2009-01-01 -1.83043170 -0.3429587 -3.0346175
2 2009-01-02 -0.03840373 -1.2165466 -5.0420255
3 2009-01-03 -0.31792713 -3.5256832 -5.6039987
4 2009-01-04 -0.04249702 -0.4144008 -4.9044614
5 2009-01-05 0.67440794 -0.8861785 5.1264533
6 2009-01-06 0.16987496 5.0971920 4.6215798
7 2009-01-07 -0.98131256 -0.9293778 -4.7451135
8 2009-01-08 0.48264218 -0.1996061 0.9067443
9 2009-01-09 -0.07346683 1.6549908 -5.3301054
10 2009-01-10 -0.23915841 -1.0623524 5.9200525
lme4, nlme
lavaan, OpenMx
igraph, statnet
quanteda, tidytext
library(lme4)
model_1 <- lmer(engagement ~ challenge + percomp + (1 | program_ID), data = df)
summary(model_1)
library(lavaan)
model <- '
# measurement model
ind60 =~ x1 + x2 + x3
dem60 =~ y1 + y2 + y3 + y4
dem65 =~ y5 + y6 + y7 + y8
# regressions
dem60 ~ ind60
dem65 ~ ind60 + dem60
# residual correlations
y1 ~~ y5
y2 ~~ y4 + y6
y3 ~~ y7
y4 ~~ y8
y6 ~~ y8
'
fit <- sem(model, data = PoliticalDemocracy)
summary(fit, standardized = TRUE, fit.measures = T)
library(igraph)
g <- graph.edgelist(edgematrix) # loads two column matrix with ties
V(g)$size = log(degree(g)) # changes size of vertices
V(g)$label <- NA # removes vertex names
V(g)$color[year_1_cohort_bool] <- "blue" # changes vertex color based on logical index
l <- layout.fruchterman.reingold(g) # selects layout
E(g)$weight <- 1 # weights each edge equal to 1
g <- simplify(g, edge.attr.comb = list(weight = "sum")) # sums edgeweights for equivalent ties
plot(g, layout = l,
edge.width = E(g)$weight)
library(quanteda)
my_corpus <- corpus(inaugTexts)
summary(my_corpus, n = 3)
Corpus consisting of 58 documents, showing 3 documents.
Text Types Tokens Sentences
1789-Washington 626 1540 23
1793-Washington 96 147 4
1797-Adams 826 2584 37
Source: /Users/joshuarosenberg/Dropbox/1_Research/R_Presentation/* on x86_64 by joshuarosenberg
Created: Fri Apr 14 09:59:07 2017
Notes:
my_dfm <- dfm(my_corpus, ignoredFeatures = stopwords("english"))
library(MatchIt) # propensity score matching
library(mgcv) # generalized additive models
library(modelr) # helper functions for modeling
library(rvest) # web scraping
library(caret) # machine learning framework
R Markdown documents combine plain text, which can be styled with very simple syntax, with executable R code
This presentation was made from an R Markdown document
R Markdown documents can be rendered to nice-looking PDF, Word, and HTML files
Makes sharing results extremely easy
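A minimal sketch of an R Markdown file (the title and chunk contents are illustrative):
---
title: "Flight delays"
output: html_document
---

Mean departure delay by carrier:

```{r}
library(tidyverse)
nycflights13::flights %>%
  group_by(carrier) %>%
  summarize(dep_delay_mean = mean(dep_delay, na.rm = TRUE))
```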
Create web applications using R scripts and analysis
Host them at http://shinyapps.io
Allow others to manipulate and interact with your analysis
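A minimal sketch of a Shiny app using the flights data (the input and output names are illustrative); save it as app.R and run it with shiny::runApp():
library(shiny)
library(nycflights13)

ui <- fluidPage(
  sliderInput("bins", "Number of bins:", min = 5, max = 50, value = 25),
  plotOutput("delay_hist")
)

server <- function(input, output) {
  output$delay_hist <- renderPlot({
    delays <- flights$dep_delay[!is.na(flights$dep_delay)] # drop missing delays
    hist(delays, breaks = input$bins, main = "Departure delays (minutes)")
  })
}

shinyApp(ui = ui, server = server)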
The devtools package makes it easy to develop packages in R
Use a package to group together common functions
Share via GitHub or CRAN (i.e., make the package available via install.packages())
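A minimal sketch of the devtools workflow ("mypackage" and the GitHub path are placeholders):
library(devtools)
create("mypackage") # set up a package skeleton
# write functions in mypackage/R/ and document them with roxygen2 comments
document("mypackage") # generate help files from the roxygen2 comments
install("mypackage") # install the package locally
# install_github("your_username/mypackage") # how others can install it from GitHub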
Joshua Rosenberg
Some of this presentation was adapted from an earlier presentation with Alex Lishinski