SGR Code from Appendices

R Code for Appendix B: Statistics with R

                                              
#===============================================================================
# Appendix B -- Statistics with R
#===============================================================================

# datasets package -------------------------------------------------------------

str(ToothGrowth)                       # Show structure of ToothGrowth data

TG = ToothGrowth                       # Shorten dataset name
TG$supp2 = as.character(TG$supp)       # Add character version of supp
dim(TG)                                # Show data frame dimensions

# AII.1 Descriptive statistics =========================================== AII.1

summary(TG$len)                        # Summary of a numeric variable
summary(TG$supp)                       # Summary of a factor variable
summary(TG)                            # Get summary of all TG variables

# Individual descriptive statistics --------------------------------------------
mean(TG$len)                           # Mean of a numeric variable
median(TG$len)                         # Median of a numeric variable
min(TG$len)                            # Minimum of a numeric variable
max(TG$len)                            # Maximum of a numeric variable
sd(TG$len)                             # Standard Deviation of numeric variable
var(TG$len)                            # Variance of numeric variable

# Note that sd and var are sample statistics (denominator = n-1).
#  you'll have to calculate the population statistics yourself:

# Calculate population variance ------------------------------------------------
popvar = sum((TG$len - mean(TG$len))^2/length(TG$len))
popsd = sqrt(popvar)                   # Population standard deviation
print.table(c("Sample variance: ",     # Here is printing with some labels
  round(var(TG$len), 2),
  "Population variance: ", 
  round(popvar, 2)))
  
print(c(sd(TG$len), popsd))            # Straight print without labels

# With missing values (not in book) -------------------------------------------- 
# Now the same things for a dataset with missing values.

TG$len[11] = NA                        # Put in a missing value 

popvar2 = sum((TG$len - mean(TG$len, na.rm = T))^2, na.rm = T)/
  (length(which(!is.na(TG$len))))
popsd2 = sqrt(popvar2)

print(c(sd(TG$len), popsd2))           # Straight print without labels

# Treatment of missing values for descriptive statistics -----------------------

mean(TG$len)                           # If we take the mean we get NA
mean(TG$len, na.rm = T)                # This will do it for us
median(TG$len, na.rm = T)              # Likewise for these other values
max(TG$len, na.rm = T)
min(TG$len, na.rm = T)

length(TG$len[is.na(TG$len) == T])     # Two ways to count missing values
summary(TG$len)

# std deviation for a data frame -----------------------------------------------

sapply(TG, sd, na.rm = T)              # Apply std dev function to data frame

# The mode issue ---------------------------------------------------------------

mode(TG$len)                           # Object TYPE-not mode statistic!

# A mode workaround
# Here we start by changing our numeric variable to a factor.  Then we count 
# count the number of times each value appears.  Finally we sort by that 
# count in decreasing order to see the mode.
#
# as.factor() turns the numeric variable into a factor
# split() breaks the var into individual vectors by the values of the factor
# the length function counts how long each of the resulting vectors is
# sapply() applies the length function to each of those vectors
# sort() sorts the resulting lengths from longest to shortest (decreasing=TRUE)

sort(sapply(split(TG$len, as.factor(TG$len)), length), decreasing = T)

# Calculating standard errors --------------------------------------------------

# 1. Calculate standard error (no missing values in dataset)
std.error = sd(TG$len)/sqrt(length(TG$len))
std.error                              # Display standard error

# Calculate standard error (missing values in dataset)

TG$len[5] = NA                         # Switch one value to missing

std.error = sd(TG$len, na.rm = T)/sqrt(length(!is.na(TG$len)))
std.error                              # Display standard error

# Appendix II.2 - Visualizing univariate data ============================ AII.2

# histograms -------------------------------------------------------------------

png(filename = "illustrations/fig-AII-1-univariate visualization 1.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches


par(mfrow=c(1, 2),                     # Set multiple plots
  mai = c(.85, .85, .5, .25))          # Set margins
hist(TG$len)                           # Default histogram: numeric var
hist(TG$len, breaks = 4)               # Histogram w/ about 4 breaks

dev.off()                              # Output png file

png(filename = "illustrations/fig-AII-2-univariate visualization 2.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches


par(mfrow=c(1, 2),                     # Set multiple plots
  mai = c(.5, .5, .25, .25))           # Set margins 
plot(TG$supp)                          # For factors just use plot()
boxplot(TG$len)                        # It doesn't get any easier than this

dev.off()                              # Output png file


# AII.3 - Bivariate descriptives: Measures of Association ================ AII.3

# We'll use the built-in trees data.  For demonstration purposes we'll just
# use the first 26 observations and add a character variable using the 
# built-in LETTERS vector (A-Z)

str(trees)                             # Show structure of built-in trees data
myTrees = trees[1:26,]                 # Use the first 26 observations
myTrees$treeID = LETTERS               # Add a letter ID for each obs
myTrees$even =                         # Add a factor variable for even and
  as.factor(c("even", "odd"))          #   odd observations
summary(myTrees)                       # Summary of new trees data


# Correlation ------------------------------------------------------------------

sapply(myTrees, is.numeric)            # Show which variables are numeric
cor(myTrees[,-c(4, 5)])                # Correlation matrix for vars 1-3
                                       # Generic numeric filter
cor(myTrees[sapply(myTrees, is.numeric)])

# Correlation output rounded ---------------------------------------------------

round(cor(myTrees[-c(4, 5)]), 2)       # Output rounded to two place

# Correlation with missing values ----------------------------------------------

myTrees$Girth[14] = NA                 # Add some missing values
myTrees$Height[3] = NA
myTrees$Volume[21] = NA

cor(myTrees[-c(4, 5)])                 # Cor() chokes on missing values
cor(myTrees[-c(4, 5)],                 # Trying to use every obs produces
  use = "everything")                  #   NA if either var has an NA
cor(myTrees[-c(4, 5)],                 # complete.obs uses only those
  use = "complete.obs")                #   observations that have no NA's
cor(myTrees[-c(4, 5)],                 # Pairwise uses obs that are 
  use = "pairwise.complete.obs")       #   complete for each pair
  
# Bivariate visualizations -----------------------------------------------------

png(filename = "illustrations/fig-AII-3-bivariate visualization.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches

par(mai = c(1, 1, .25, .25))           # Set margins

plot(myTrees$Girth,myTrees$Height,     # Default 2 variable scatter plot
  xlim = c(5, 20),ylim = c(40, 100))   # Set axis lengths
abline(                                # Add 2 variable regression line
  lm(myTrees$Height ~ myTrees$Girth))    

dev.off()                              # Output png file 

# pairs plot -------------------------------------------------------------------

png(filename = "illustrations/fig-AII-4-pairs plot.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches

par(mai = c(0, 0, 0, 0))               # Set margins

pairs(myTrees[-c(4, 5)])               # Pairs to produce bivariate plots

dev.off()                              # Output png file

# AII.4  - Hypothesis testing ============================================ AII.4

# Using edata--environmental data

edata = read.delim("http://www.kktg.net/R/Appendix2Data.txt",
  colClasses =                         # Set column storage modes
    c("character", rep(NA, 8)),        #   1st col char, let R choose rest 
  header = TRUE,                       # Use column headers for variable names
  row.names = 1)                       # Use first column for obs names
edata$Kyoto = as.logical(edata$Kyoto)  # Make signatory status logical obj

summary(edata)                         # Summary of variables

# Outliers ---------------------------------------------------------------------
# Show outliers

print(edata[which(abs(edata$dCO2) > 2.5), c("Kyoto", "dCO2")], digits = 2)

# Drop outliers
edata2 = edata[which(abs(edata$dCO2) < 2.5),]

summary(edata2)
#-------------------------------------------------------------------------------
var.test(                              # Check variances to be sure
  edata2$dCO2[edata2$Kyoto],           #   t-test is appropriate
  edata2$dCO2[!edata2$Kyoto])          # This generates an F statistic.


kCO2 = t.test(                         # Construct t-test to compare mean
  edata2$dCO2[edata2$Kyoto],           #   change in CO2 output between
  edata2$dCO2[!edata2$Kyoto])          #   signatories/non-signatories.

kCO2                                   # Print results
summary(kCO2)                          # Show elements in the t-test
kCO2$statistic                         # Print just the t-statistic
kCO2$p.value                           # Print just the p-value

# AII.5 - Crosstabulation ================================================ AII.5

# Using same edata

edata2$VA98 =                          # Create a dichotomous measure of
  ifelse(edata2$WBVA98 > 0, T, F)      #   voice and accountability
dkxtab = xtabs(                        # Crosstab 1998 VA and Kyoto signing
  formula = ~ edata2$VA98 + edata2$Kyoto)   
dkxtab                                 # Display cell values
summary(dkxtab)                        # Display crosstab results (chi-square)

#  crosstab using chisq.test() on contingency table ----------------------------

VAK = table(                           # Get table showing frequencies
  edata2$VA98,                         #   for high/low V & A and
  edata2$Kyoto)                        #   Kyoto signature status

VAKtest = chisq.test(VAK,              # Chi square test for matrix
  correct = FALSE)                     # Turn off continuity correction

VAKtest                                # Display results

VAKtest$expected                       # Show expected cell values matrix

# AII.6 - Analysis of Variance (ANOVA) =================================== AII.6

edata2$VA = ifelse(edata2$WBVA > 0, T, F)  # Create Voice indicator

# Show compliance means by dem
tapply(edata2$EG,                      # Tapply mean() to Econ Growth
  edata2$Kyoto,                        # Partition data by Kyoto
  FUN = mean,                          # Apply mean()
  na.rm = T)                           # Remove missing observations

# ANOVA analysis ---------------------------------------------------------------

myAOV = aov(EG ~ Kyoto, data = edata2) # ANOVA w/aov()
summary(myAOV)                         # Display results
myAnova = anova(                       # ANOVA w/anova()
  lm(edata2$EG ~ edata2$Kyoto))        #  analyzing a linear model
myAnova                                # Display results

# Use aov() results to get group means
print(model.tables(myAOV,"means"), digits = 2)

# Do boxplot

png(filename = "illustrations/fig-AII-5-boxplot.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches

par(mai = c(.5, .5, .25, .25))         # Set margins

boxplot(EG ~ Kyoto, data = edata2)     # Produce boxplot
points(1 + jitter(as.integer(edata2$Kyoto)), edata2$EG)

dev.off()                              # Output png file


# AII.7 - Linear Regression ============================================== AII.7

# First we'll check on correlations

cor(edata2$WBVA, log(edata2$GDPc),     # Check correlations
  use="pairwise.complete.obs")         # Use all complete observations

# ------------------------------------------------------------------------------
myModel = lm(                          # Set up a linear model with
  edata2$WBRQ~                         #  Quality of regulation as a
   edata2$WBVA+                        #  function of Voice/Accountability
    log(edata2$GDPc))                  #  and the log of GDP per capita

summary(myModel)                       # Show model results

cor(edata2[sapply(edata2,is.numeric)], # Correlation matrix for numeric vars
  use = "pairwise.complete.obs")         # Use all obs with complete pairs

myDiag = ls.diag(myModel)              # set up model diagnostics
attributes(myDiag)
round(myDiag$correlation,2)            # Show correlation matrix


plot(myModel$fitted.values,            # plot residuals v. fitted
  myDiag$std.res)

plot(myModel)                          # Generate standard lm plots

R Code for Appendix D: Some Style Guidelines

#===============================================================================
# Appendix D - Some style guidelines
#===============================================================================

# The importance of style and some white space here and there
         
# Example 1 - No white space and no comments

myVariable=c(1,4,6,7,9,13,15,52,62,78)
popvar=sum((myVariable-mean(myVariable))^2/length(myVariable));popsd=
sqrt(popvar);print.table(c("Variance: sample -",round(var(myVariable),2),
"population -",round(popvar,2)));print(c(sd(myVariable), popsd))

# Example 2 - White space & comments

# Create some data
myVariable = c(1, 4, 6, 7, 9, 13, 15, 52, 62, 78)

# Calculate population variance
popvar = sum((myVariable - mean(myVariable))^2/length(myVariable))

# Calculate population standard deviation
popsd = sqrt(popvar) 

# Print results with labels
print.table(c("Variance: sample -",
  round(var(myVariable), 2),
  "population -", 
  round(popvar, 2))
)

# Print results without labels
print(c(sd(myVariable), popsd))  

# Indentation ------------------------------------------------------------------
# Double space indentation
myData = read.delim("myDataFile.txt", 
  headers = TRUE, 
  colClasses = c("character", "logical, 
    "logical", "numeric", "factor", 
    "character", "numeric", "Date")
  na.strings = "N/A")

# Parenthetical alignment indentation  
myData = read.delim("myDataFile.txt", 
                    headers = TRUE, 
                    colClasses = c("character", "logical, "logical", 
                                   "numeric", "factor", "character", 
                                   "numeric", "Date")
                    na.strings = "N/A")
  
# Placement of parentheses, brackets, and braces -------------------------------

myFunction = function(x, y, z){          # Open w/ curly bracket on same line
  print("This function doesn't do anything")
}                                      # End w/ curly bracket on own line