R Code for Appendix B: Statistics with R
#===============================================================================
# Appendix B -- Statistics with R
#===============================================================================
# datasets package -------------------------------------------------------------
str(ToothGrowth) # Show structure of ToothGrowth data
TG = ToothGrowth # Shorten dataset name
TG$supp2 = as.character(TG$supp) # Add character version of supp
dim(TG) # Show data frame dimensions
# AII.1 Descriptive statistics =========================================== AII.1
summary(TG$len) # Summary of a numeric variable
summary(TG$supp) # Summary of a factor variable
summary(TG) # Get summary of all TG variables
# Individual descriptive statistics --------------------------------------------
mean(TG$len) # Mean of a numeric variable
median(TG$len) # Median of a numeric variable
min(TG$len) # Minimum of a numeric variable
max(TG$len) # Maximum of a numeric variable
sd(TG$len) # Standard Deviation of numeric variable
var(TG$len) # Variance of numeric variable
# Note that sd and var are sample statistics (denominator = n-1).
# you'll have to calculate the population statistics yourself:
# Calculate population variance ------------------------------------------------
popvar = sum((TG$len - mean(TG$len))^2/length(TG$len))
popsd = sqrt(popvar) # Population standard deviation
print.table(c("Sample variance: ", # Here is printing with some labels
round(var(TG$len), 2),
"Population variance: ",
round(popvar, 2)))
print(c(sd(TG$len), popsd)) # Straight print without labels
# With missing values (not in book) --------------------------------------------
# Now the same things for a dataset with missing values.
TG$len[11] = NA # Put in a missing value
popvar2 = sum((TG$len - mean(TG$len, na.rm = T))^2, na.rm = T)/
(length(which(!is.na(TG$len))))
popsd2 = sqrt(popvar2)
print(c(sd(TG$len), popsd2)) # Straight print without labels
# Treatment of missing values for descriptive statistics -----------------------
mean(TG$len) # If we take the mean we get NA
mean(TG$len, na.rm = T) # This will do it for us
median(TG$len, na.rm = T) # Likewise for these other values
max(TG$len, na.rm = T)
min(TG$len, na.rm = T)
length(TG$len[is.na(TG$len) == T]) # Two ways to count missing values
summary(TG$len)
# std deviation for a data frame -----------------------------------------------
sapply(TG, sd, na.rm = T) # Apply std dev function to data frame
# The mode issue ---------------------------------------------------------------
mode(TG$len) # Object TYPE-not mode statistic!
# A mode workaround
# Here we start by changing our numeric variable to a factor. Then we count
# count the number of times each value appears. Finally we sort by that
# count in decreasing order to see the mode.
#
# as.factor() turns the numeric variable into a factor
# split() breaks the var into individual vectors by the values of the factor
# the length function counts how long each of the resulting vectors is
# sapply() applies the length function to each of those vectors
# sort() sorts the resulting lengths from longest to shortest (decreasing=TRUE)
sort(sapply(split(TG$len, as.factor(TG$len)), length), decreasing = T)
# Calculating standard errors --------------------------------------------------
# 1. Calculate standard error (no missing values in dataset)
std.error = sd(TG$len)/sqrt(length(TG$len))
std.error # Display standard error
# Calculate standard error (missing values in dataset)
TG$len[5] = NA # Switch one value to missing
std.error = sd(TG$len, na.rm = T)/sqrt(length(!is.na(TG$len)))
std.error # Display standard error
# Appendix II.2 - Visualizing univariate data ============================ AII.2
# histograms -------------------------------------------------------------------
png(filename = "illustrations/fig-AII-1-univariate visualization 1.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mfrow=c(1, 2), # Set multiple plots
mai = c(.85, .85, .5, .25)) # Set margins
hist(TG$len) # Default histogram: numeric var
hist(TG$len, breaks = 4) # Histogram w/ about 4 breaks
dev.off() # Output png file
png(filename = "illustrations/fig-AII-2-univariate visualization 2.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mfrow=c(1, 2), # Set multiple plots
mai = c(.5, .5, .25, .25)) # Set margins
plot(TG$supp) # For factors just use plot()
boxplot(TG$len) # It doesn't get any easier than this
dev.off() # Output png file
# AII.3 - Bivariate descriptives: Measures of Association ================ AII.3
# We'll use the built-in trees data. For demonstration purposes we'll just
# use the first 26 observations and add a character variable using the
# built-in LETTERS vector (A-Z)
str(trees) # Show structure of built-in trees data
myTrees = trees[1:26,] # Use the first 26 observations
myTrees$treeID = LETTERS # Add a letter ID for each obs
myTrees$even = # Add a factor variable for even and
as.factor(c("even", "odd")) # odd observations
summary(myTrees) # Summary of new trees data
# Correlation ------------------------------------------------------------------
sapply(myTrees, is.numeric) # Show which variables are numeric
cor(myTrees[,-c(4, 5)]) # Correlation matrix for vars 1-3
# Generic numeric filter
cor(myTrees[sapply(myTrees, is.numeric)])
# Correlation output rounded ---------------------------------------------------
round(cor(myTrees[-c(4, 5)]), 2) # Output rounded to two place
# Correlation with missing values ----------------------------------------------
myTrees$Girth[14] = NA # Add some missing values
myTrees$Height[3] = NA
myTrees$Volume[21] = NA
cor(myTrees[-c(4, 5)]) # Cor() chokes on missing values
cor(myTrees[-c(4, 5)], # Trying to use every obs produces
use = "everything") # NA if either var has an NA
cor(myTrees[-c(4, 5)], # complete.obs uses only those
use = "complete.obs") # observations that have no NA's
cor(myTrees[-c(4, 5)], # Pairwise uses obs that are
use = "pairwise.complete.obs") # complete for each pair
# Bivariate visualizations -----------------------------------------------------
png(filename = "illustrations/fig-AII-3-bivariate visualization.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(1, 1, .25, .25)) # Set margins
plot(myTrees$Girth,myTrees$Height, # Default 2 variable scatter plot
xlim = c(5, 20),ylim = c(40, 100)) # Set axis lengths
abline( # Add 2 variable regression line
lm(myTrees$Height ~ myTrees$Girth))
dev.off() # Output png file
# pairs plot -------------------------------------------------------------------
png(filename = "illustrations/fig-AII-4-pairs plot.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(0, 0, 0, 0)) # Set margins
pairs(myTrees[-c(4, 5)]) # Pairs to produce bivariate plots
dev.off() # Output png file
# AII.4 - Hypothesis testing ============================================ AII.4
# Using edata--environmental data
edata = read.delim("http://www.kktg.net/R/Appendix2Data.txt",
colClasses = # Set column storage modes
c("character", rep(NA, 8)), # 1st col char, let R choose rest
header = TRUE, # Use column headers for variable names
row.names = 1) # Use first column for obs names
edata$Kyoto = as.logical(edata$Kyoto) # Make signatory status logical obj
summary(edata) # Summary of variables
# Outliers ---------------------------------------------------------------------
# Show outliers
print(edata[which(abs(edata$dCO2) > 2.5), c("Kyoto", "dCO2")], digits = 2)
# Drop outliers
edata2 = edata[which(abs(edata$dCO2) < 2.5),]
summary(edata2)
#-------------------------------------------------------------------------------
var.test( # Check variances to be sure
edata2$dCO2[edata2$Kyoto], # t-test is appropriate
edata2$dCO2[!edata2$Kyoto]) # This generates an F statistic.
kCO2 = t.test( # Construct t-test to compare mean
edata2$dCO2[edata2$Kyoto], # change in CO2 output between
edata2$dCO2[!edata2$Kyoto]) # signatories/non-signatories.
kCO2 # Print results
summary(kCO2) # Show elements in the t-test
kCO2$statistic # Print just the t-statistic
kCO2$p.value # Print just the p-value
# AII.5 - Crosstabulation ================================================ AII.5
# Using same edata
edata2$VA98 = # Create a dichotomous measure of
ifelse(edata2$WBVA98 > 0, T, F) # voice and accountability
dkxtab = xtabs( # Crosstab 1998 VA and Kyoto signing
formula = ~ edata2$VA98 + edata2$Kyoto)
dkxtab # Display cell values
summary(dkxtab) # Display crosstab results (chi-square)
# crosstab using chisq.test() on contingency table ----------------------------
VAK = table( # Get table showing frequencies
edata2$VA98, # for high/low V & A and
edata2$Kyoto) # Kyoto signature status
VAKtest = chisq.test(VAK, # Chi square test for matrix
correct = FALSE) # Turn off continuity correction
VAKtest # Display results
VAKtest$expected # Show expected cell values matrix
# AII.6 - Analysis of Variance (ANOVA) =================================== AII.6
edata2$VA = ifelse(edata2$WBVA > 0, T, F) # Create Voice indicator
# Show compliance means by dem
tapply(edata2$EG, # Tapply mean() to Econ Growth
edata2$Kyoto, # Partition data by Kyoto
FUN = mean, # Apply mean()
na.rm = T) # Remove missing observations
# ANOVA analysis ---------------------------------------------------------------
myAOV = aov(EG ~ Kyoto, data = edata2) # ANOVA w/aov()
summary(myAOV) # Display results
myAnova = anova( # ANOVA w/anova()
lm(edata2$EG ~ edata2$Kyoto)) # analyzing a linear model
myAnova # Display results
# Use aov() results to get group means
print(model.tables(myAOV,"means"), digits = 2)
# Do boxplot
png(filename = "illustrations/fig-AII-5-boxplot.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(.5, .5, .25, .25)) # Set margins
boxplot(EG ~ Kyoto, data = edata2) # Produce boxplot
points(1 + jitter(as.integer(edata2$Kyoto)), edata2$EG)
dev.off() # Output png file
# AII.7 - Linear Regression ============================================== AII.7
# First we'll check on correlations
cor(edata2$WBVA, log(edata2$GDPc), # Check correlations
use="pairwise.complete.obs") # Use all complete observations
# ------------------------------------------------------------------------------
myModel = lm( # Set up a linear model with
edata2$WBRQ~ # Quality of regulation as a
edata2$WBVA+ # function of Voice/Accountability
log(edata2$GDPc)) # and the log of GDP per capita
summary(myModel) # Show model results
cor(edata2[sapply(edata2,is.numeric)], # Correlation matrix for numeric vars
use = "pairwise.complete.obs") # Use all obs with complete pairs
myDiag = ls.diag(myModel) # set up model diagnostics
attributes(myDiag)
round(myDiag$correlation,2) # Show correlation matrix
plot(myModel$fitted.values, # plot residuals v. fitted
myDiag$std.res)
plot(myModel) # Generate standard lm plots
R Code for Appendix D: Some Style Guidelines
#===============================================================================
# Appendix D - Some style guidelines
#===============================================================================
# The importance of style and some white space here and there
# Example 1 - No white space and no comments
myVariable=c(1,4,6,7,9,13,15,52,62,78)
popvar=sum((myVariable-mean(myVariable))^2/length(myVariable));popsd=
sqrt(popvar);print.table(c("Variance: sample -",round(var(myVariable),2),
"population -",round(popvar,2)));print(c(sd(myVariable), popsd))
# Example 2 - White space & comments
# Create some data
myVariable = c(1, 4, 6, 7, 9, 13, 15, 52, 62, 78)
# Calculate population variance
popvar = sum((myVariable - mean(myVariable))^2/length(myVariable))
# Calculate population standard deviation
popsd = sqrt(popvar)
# Print results with labels
print.table(c("Variance: sample -",
round(var(myVariable), 2),
"population -",
round(popvar, 2))
)
# Print results without labels
print(c(sd(myVariable), popsd))
# Indentation ------------------------------------------------------------------
# Double space indentation
myData = read.delim("myDataFile.txt",
headers = TRUE,
colClasses = c("character", "logical,
"logical", "numeric", "factor",
"character", "numeric", "Date")
na.strings = "N/A")
# Parenthetical alignment indentation
myData = read.delim("myDataFile.txt",
headers = TRUE,
colClasses = c("character", "logical, "logical",
"numeric", "factor", "character",
"numeric", "Date")
na.strings = "N/A")
# Placement of parentheses, brackets, and braces -------------------------------
myFunction = function(x, y, z){ # Open w/ curly bracket on same line
print("This function doesn't do anything")
} # End w/ curly bracket on own line