# R Code for Chapter 1: Introduction
#===============================================================================
# Chapter 1 -- Introduction
#===============================================================================
# 1.1a - R commands ======================================================= 1.1a
# This comment identifies the purpose of this R-script
# It is really just to demonstrate what R commands look like.
# R Doesn't care about blank lines. My first command will create an
# x variable and give it a value. Although it is reasonably obvious,
# I'll also include a comment on the right explaining what I am doing.
myX1 = 7 # This assigns the value 7 to myX1
myX2 = c(1, 5, 3, 8, 2, 4) # Create myX2 with a set of values
mean(myX2) # Find the mean for myX2
# Voila! Our first R program. Now we can highlight those commands and
# use the "Run" button to send them to the R console for execution.
# 1.3b Command conventions ================================================ 1.3b
myVar = c(1, 2, 4, NA, 7)                  # Create a variable w/some data
mean(myVar, na.rm = TRUE)                  # Mean of myVar w/o missing values
                                           # (TRUE is spelled out: T is just a
                                           # variable and can be reassigned)
length(myVar)                              # Num of observations in myVar
                                           # (the NA is counted too)
ls()                                       # List all objects
# Command short and alternative forms
myRandNum = rnorm(1) # A single random number from a normal
# distribution assuming defaults
myRandNum = rnorm(1, 0, 1) # The same thing showing defaults
myRandNum = rnorm(1, # A random number from normal dist
mean = 0, # with mean = 0
sd = 1) # and std dev = 1
myRandNum = rnorm(n = 1, # A random number from normal dist
sd = 1, # with std dev = 1
mean = 0) # and mean = 0
# Using help examples ----------------------------------------------------------
example(mean) # Run example from mean() help page
# Note: The other examples from Chapter 1 are not included here, as they aren't
# really meant to be executed.
# R Code for Chapter 2: A Sample Session
#===============================================================================
# Chapter 2 -- A sample session
#===============================================================================
options(width = 75) # Limit output width to fit book format
# Read data from web (Note - this will change to a data site on
# www.sagepub.com/gaubatz)
myDF = read.delim("http://www.kktg.net/R/Chapter2Data.txt",
header = TRUE, # Data has headers for var names
colClasses = c("character", # Set storage modes of variables
"numeric", "numeric", "numeric", # This is just a convenience and
"numeric", "factor", "numeric")) # could be adjusted afterwards
summary(myDF) # Summarize dataset
# Standard deviations ----------------------------------------------------------
sd(myDF$GDPc, na.rm = T) # Get standard deviations
sd(myDF$MilSpend, na.rm = T) # removing missing values
sd(myDF$EdSpend, na.rm = T)
sd(myDF$FemLife, na.rm = T)
sd(myDF$PressStat, na.rm = T)
sd(myDF$PressFree, na.rm = T) # Note R will do sd of a factor
# A simple plot ----------------------------------------------------------------
png(filename = "illustrations/fig-2-1-femlife plot.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(1, 1, .25, .25))
plot(myDF$PressStat, myDF$FemLife) # Plot: femlife by free press
dev.off() # Output png file
# Hypothesis testing -----------------------------------------------------------
# Check equality of variances
var.test( # Run a test comparing variances
myDF$FemLife[myDF$PressFree == "F"], # FemLife in states w/ free press
myDF$FemLife[myDF$PressFree == "NF"])# compared to states w/o free press
# t-test -----------------------------------------------------------------------
t.test( # A t-test for the hypothesis that
myDF$FemLife[myDF$PressFree == "F"], # states w/ press freedom and
myDF$FemLife[myDF$PressFree == "NF"])# states w/o press freedom have
# the same female life expectancy
# A simple boxplot -------------------------------------------------------------
png(filename = "illustrations/fig-2-2-femlife boxplot.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(.5, .5, .25, .25)) # Set margins - no Title, no x label
boxplot(myDF$FemLife ~ myDF$PressFree) # Boxplot: femlife by free press status
dev.off() # Output png file
# Look at states missing press freedom status ----------------------------------
myDF$Country.name[myDF$PressFree == "" # List states missing press status
& !is.na(myDF$FemLife)] # but not missing f life exp.
# Getting the factor levels sorted ---------------------------------------------
# Here we want to have our levels of press freedom displayed in substantive
# rather than alphabetical order---that is Free, Partly Free, Not Free, rather
# than the alphabetical order. We also want missing values (NAs) to be treated
# as missing rather than as another discrete level.
str(myDF$PressFree) # Show current structure of factor
myDF$PressFree = # Reorder factor levels in
factor(myDF$PressFree, # substantive rather than
levels = c("F", "PF", "NF")) # alphabetical order
str(myDF$PressFree) # Show current structure of factor
# A simple multi-variate model -------------------------------------------------
model1 = lm(FemLife ~ GDPc + MilSpend + PressFree, data = myDF)
summary(model1)
# Another simple plot ----------------------------------------------------------
png(filename = "illustrations/fig-2-3-femlife GDPc plot.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(1, 1, .25, .25)) # Set margins - no Title
plot(myDF$GDPc, myDF$FemLife) # Plot: femlife by GDPc
dev.off() # Output png file
# Once more, with a log transformation -----------------------------------------
png(filename = "illustrations/fig-2-4-femlife log GDPc plot.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(1, 1, .25, .25)) # Set margins - no Title
plot(log(myDF$GDPc), myDF$FemLife) # Plot: femlife by ln of GDPc
dev.off() # Output png file
# A simple multi-variate model with log transformations ------------------------
model2 = lm(FemLife ~ log(GDPc) + log(MilSpend) + PressFree,
data = myDF)
summary(model2)
# R Code for Chapter 3: R Objects
#===============================================================================
# Chapter 3 -- R Objects
#===============================================================================
# 3.1 R Objects and their Names ============================================ 3.1
# 3.2 How to think about data objects in R ================================= 3.2
# Figure 3.1: A graphic representation of my R objects approach ----------------
# This is just an exercise in drawing with R. The code doesn't really serve
# much purpose beyond that. But, if you are needing to do work with shapes
# following the lead from Chapter 14, you might find some bits of use.
# This starts with the custom ovals function from chapter 10 and the
# custom arrows function from chapter 14 already loaded.
png(filename = "illustrations/fig-3-1-R objects schema.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(0, 0, 0, 0)) # Set margins - no Title
par(usr = c(0, 1, 0, 1)) # Set coordinates to 0,1 space
plot.new() # Start a new plot
oval(xcen = .55, ycen = .5, # Top oval for obj type container
xlen = .175, ylen = .025, # Length of radii
ewidth = 2, # Line width
ecolor = "gray") # Line color
myArrow(x0 = .26, x1 = .445, # Arrow from storage mode to object
y0 = .75, y1 = .75,
angle = 45, # Set arrowhead angle
ljoin = 1, # Set line join type
lwd = 25, # Set line width
col = "light gray") # Set color
polygon( # Arrow body
x = c(.5, .5, .6, .6),
y = c(.55, .7, .7, .55),
col = "light gray",
border = "light gray")
polygon( # Arrow head
x = c(.55, .475, .625),
y = c(.49, .55, .55),
col = "light gray",
border = "light gray")
myArrow(x0 = .8, x1 = .655, # Arrow from class to object
y0 = .85, y1 = .75,
L = .25, # Set length of arrowhead
angle = 30, # Set arrowhead angle
ljoin = 2, # Set line join type
lwd = 10, # Set line width
col = "light gray") # Set color
polygon( # Rectangle around Storage Mode
x = c(.001, .001, .249, .249),
y = c(.45, .99, .99, .45),
col = "white",
lwd = 2,
border = "gray")
polygon( # Rectangle around pseudo-Storage Mode
x = c(.001, .001, .249, .249),
y = c(.45, .625, .625, .45),
col = gray(.8),
lwd = 2,
border = "gray")
polygon( # Rectangle around Class
x = c(.8, .8, .95, .95),
y = c(.8, .9, .9, .8),
col = "white",
lwd = 2,
border = "gray")
symbols(x = .55, y = .75, # Filled circle behind R Data Object
circles = .1, # Set circle size
inches = FALSE, # Don't constrain to inch size
add = TRUE, # Add to current plot
bg = "white") # Set background color
oval(xcen = .55, ycen = .75, # Circle around R Data Object in center
xlen = .1, ylen = .1*(3/2), # Y radius adjusted for aspect ratio
ewidth = 6, # Set line size
ecolor = "gray") # Set color
text(x = .55, y = .75, # R Data Object Text
labels = "R Data\nObject",
cex = 1.5, # Set font size
font = 2, # Bold
family = "serif") # Serif Font
text(x = .55, y = .375, # Data Object Type Text
labels = "Data Object Type",
pos = 3, # Above and centered
cex = 1.25, # Set font size
font = 2, # Bold
family = "serif") # Serif Font
text(x = .55, y = .325, # List of Data Object Types
labels = "Vector\nMatrix\nData Frame\nList",
pos = 1, # Below and centered
cex = 1, # Set font size
family = "sans") # Sans serif font
text(x = .125, y = .9, # Data Object Storage Mode Text
labels = "Storage Mode",
pos = 3, # Above and centered
cex = 1.25, # Set font size
font = 2, # Bold
family = "serif") # Serif Font
text(x = .125, y = .85, # Storage Modes List
labels = "Logical\nNumeric\nCharacter\n\n\nDate/Time\nFactor",
pos = 1, # Below and centered
cex = 1, # Set font size
family = "sans") # Sans serif font
text(x = .875, y = .85, # Data Object Class Text
labels = "Class",
cex = 1.25, # Set font size
font = 2, # Bold
family = "serif") # Serif Font
oval(xcen = .55, ycen = .05, # Bottom oval for obj type container
xlen = .175, ylen = .025,
ewidth = 2,
ecolor = "gray")
segments(x0 = .375, x1 = .375, # Left line for object type container
y0 = .05, y1 = .5,
lwd = 2,
col = "gray")
segments(x0 = .725, x1 = .725, # Right line for object type container
y0 = .05, y1 = .5,
lwd = 2,
col = "gray")
dev.off() # Output png file
# 3.3 R Object storage modes =============================================== 3.3
# Create some diverse objects
# Non-Data objects # A function to add 10 to any number
myFunction = function(x){
  # Shift the input up by ten and hand back the shifted value
  shifted = x + 10
  shifted
}
myModel = # Output from a regression model
lm(c(1:10) ~ c(1, 3, 2, 5, 4:9))
# Data objects
myInteger = as.integer(4) # An integer (whole number)
myWholeNumber = 5 # A whole number stored as double
myDouble = 3.7 # A numeric-double number
myOtherInteger = as.integer(3.7) # Non-whole num converted to integer
myLogical1 = TRUE # A logical value set to TRUE
myLogical2 = FALSE # A logical value set to FALSE
myCharacter = "Hello World!" # A character string
typeof(myInteger) # Test "typeof" for integer
mode(myInteger) # Test "mode" for integer
typeof(myWholeNumber) # Test "typeof" for whole number
typeof(myDouble) # Test "typeof" for double
mode(myDouble) # Test "mode" for double
myOtherInteger # Show double (3.7) converted to integer
typeof(myOtherInteger) # Show "typeof" for converted to integer
mode(myOtherInteger) # Show "mode" for convert to integer
typeof(myLogical1) # Test "typeof" for logical
typeof(myCharacter) # Test "typeof" for character
# 3.4 R data object Types ================================================== 3.4
myVector = c(1:5)
myDF = data.frame(c(1:5), letters[1:5], LETTERS[1:5])
myList = list(c(1, 2, 3), "This is a list", TRUE)
myList = list("This is a list with my Dataframe", myDF)
DOType = function(x){                      # DOType function ----------------------+
  # Identify the data object type of x, i.e. the kind of container that      |
  # holds a collection of things: vector, matrix, data frame, list, factor.  |
  #                                                                          |
  # Argument:                                                                |
  #   x  Any R object.                                                       |
  # Value:                                                                   |
  #   A single character string: "data frame", "list", "matrix", "factor",   |
  #   "vector", or "Not a recognized data object" if none of those fit.      |
  #                                                                          |
  # The checks run from most to least specific. A data frame is also a       |
  # list, and a list also passes is.vector(), so the order matters.          |
  if (is.data.frame(x)) {                  # Data frames first (they are lists too)
    DOT = "data frame"
  } else if (is.list(x)) {                 # Any other list
    DOT = "list"
  } else if (is.matrix(x)) {               # Two-dimensional atomic object
    DOT = "matrix"
  } else if (is.factor(x)) {               # Factors carry attributes, so they
    DOT = "factor"                         # fail is.vector() and need own check
  } else if (is.vector(x)) {               # Plain vector
    DOT = "vector"
  } else {                                 # None of the above
    DOT = paste("Not a", "recognized data object")
  }
  return(DOT)                              # Return the appropriate value
}                                          # End of function ----------------------+
# 3.5 The basic objects: Vectors =========================================== 3.5
myLogicalVector = c(TRUE, FALSE, TRUE, TRUE) # Set up a logical vector (TRUE/FALSE
                                           # spelled out; T and F can be reassigned)
myNumericVector = c(1, 2, 4, 7)            # Set up a numeric (double) vector
myTextVector = c("a", "b", "7", "x")       # Set up a text vector
typeof(myLogicalVector)                    # Show type for logical vector
typeof(myNumericVector)                    # Show type for numeric vector
mode(myTextVector)                         # Show mode for text vector
# ------------------------------------------------------------------------------
myScalar = 1 # Create a scalar with value 1
yourScalar = 2 # Create a scalar with value 2
myScalar + yourScalar # Add the two scalars
# ------------------------------------------------------------------------------
myVector1 = c(0, 5, 18) # Set up a numeric vector
typeof(myVector1) # Check vector type
myVector2 = c(TRUE, TRUE, FALSE) # Set up a logical vector
typeof(myVector2) # Check vector type
myVector3 = c("Fred", "Joe", "Simon") # Set up a character vector
typeof(myVector3) # Check vector type
myVector1 + myVector3 # Add num & char vectors (error)
myVector1 + myVector2 # Add numeric and logical vectors
as.logical(myVector1) # Treat numeric vector as logical
# 3.5a Vector indices ===================================================== 3.5a
myVector = c(3, 9, 5) # Set up vector of numeric values
myVector[2] # Get the vector's 2nd element
myVector[3] # Get the vector's 3rd element
# ------------------------------------------------------------------------------
myVector = c(3, 9, 5) # Set up vector of numeric values
myIndex = 1 # Set up a selection index
myVector[myIndex] # Use index for selection
# ------------------------------------------------------------------------------
myVector = c("Bob", "Mary", "Fred") # Set up vector of character values
myVector[2] # Show second element
myIndex = c(1, 3) # Set up an index variable
myVector[myIndex] # Select vector elements w/index
# 3.5b Vector operations ================================================== 3.5b
# Vectorized operation
myVector1 = c(1, 2, 3) # Create vector with 3 elements
myVector2 = log(myVector1) # Create new vector with log
myVector2 # Print new vector
# ------------------------------------------------------------------------------
# non-vectorized operation
myVector = c(1, 2, 3) # Numeric vector with 3 elements
if(myVector == 1) print("Answer is 1") # This produces a likely error
if(myVector[1] == 1) # This time we specify the first element
print("Answer is 1")
myAnswer = ifelse(myVector == 1, # Here is the vectorized ifelse()
"Answer is 1", # it operates on each element in
"Answer is not 1") # the vector individually
myAnswer # Show results
# ------------------------------------------------------------------------------
# Whole vector operations
myVector = c(1, 2, 3) # Create a vector
min(myVector) # Get the vector minimum
max(myVector) # Get the vector maximum
sum(myVector) # Sum the vector elements
# ------------------------------------------------------------------------------
# Vector math
myVector1 = c(1, 2, 3)                     # A vector with 3 elements
myVector2 = c(10, 20, 30)                  # Another 3 element vector
myVector1 + myVector2                      # Vector addition
myVector1 * myVector2                      # Vector multiplication
myVector2/myVector1                        # Vector division
myVector5 = c(10, 20, 30, 40, 50)          # A vector with 5 elements
myVector1 + myVector5                      # Add different length vectors: the
                                           # shorter one is recycled, with a
                                           # warning since 5 is not a multiple of 3
myVector6 = c(10, 20, 30, 40, 50, 60)      # A vector with 6 elements
myVector1 + myVector6                      # Add different length vectors when one
                                           # length is a multiple of the other
                                           # (recycled silently)
# 3.6 The basic objects: Matrices and their indices ======================== 3.6
# Matrix math is not the same as matrix algebra!
myMatrix = rbind(c(3, 8), c(23, 33)) # Create matrix by binding two rows
myMatrix # Display myMatrix
myMatrix1 = myMatrix + 7 # Add 7 to each element of myMatrix
myMatrix1 # Display myMatrix1
myMatrix2 = rbind(c(2, 5), c(3, 2)) # Create another matrix by binding rows
myMatrix2 # Display myMatrix2
myMatrix3 = myMatrix1 / myMatrix2 # Divide myMatrix1 by myMatrix2
myMatrix3 # Display myMatrix3
myMatrix4 = myMatrix2 %*% myMatrix3 # Use matrix algebra multiplication
myMatrix2 # Display myMatrix2
myMatrix3 # Display myMatrix3
myMatrix4 # Display myMatrix4
# 3.7 The basic objects: Data frames ======================================= 3.7
# Row and column names
myData = data.frame(cbind(                 # Create the data frame to be named
  c(1, 0, 1, 1, 0),                        # Vector 1
  c(24, 38, 22, 51, 17)))                  # Vector 2
rownames(myData) =                         # Add row names
  c("Mary", "Mike", "Mia", "Mish", "Mark")
colnames(myData) = c("sex", "age")         # Add column names
mydata = data.frame(                       # A data frame whose 1st row holds
  V1 = c("sex", "1", "0"),                 # the variable names (as when a file
  V2 = c("age", "24", "38"),               # is read without header = TRUE)
  stringsAsFactors = FALSE)                # Keep the values as character
colnames(mydata) = mydata[1,]              # Set col names to values in 1st row
                                           # Note that this keeps first row as
                                           # first obs in the data frame
# 3.7a Referencing data frame elements ==================================== 3.7a
# Attach and other methods =====================================================
# Set up a data frame
myDF = data.frame( # Create a data frame
myVar1 = c(seq(0, 100, by = 5)), # Set up variable 1
myVar2 = c(0:20)) # Set up variable 2
# 1. the $ construction --------------------------------------------------------
mean(myDF$myVar1) # This always works & minimizes errors
# 2. The attach() method -------------------------------------------------------
attach(myDF) # Attach the data frame
mean(myVar2) # Do something with the data frame
myVar2 = myVar2 * 2 # Here is a transformation
mean(myVar2) # Mean of transformed variable
detach(myDF) # Detach the data frame
mean(myDF$myVar2) # Transform lost outside of attach
# 3. The data= method ----------------------------------------------------------
lm(myVar1 ~ myVar2, data = myDF) # Do something with the data frame again
# 4. The with() method ---------------------------------------------------------
with(myDF, { # Use with() to indicate data frame
myVar1 = myVar1/2 # Transform myVar1
sd(myVar1) # Std dev. of transformed myVar1
}) # Note close brace & close paren finish
sd(myDF$myVar1) # Transform lost outside with()
# 3.7b Displaying the contents of a data frame ============================ 3.7b
myVar1 = c("a", "b", "c") # Create char variable
myVar2 = c(10, 11, 12) # Create numeric variable
myDF = data.frame(myVar1, myVar2) # Combine into data frame
myDF # Print data frame
names(myDF) # Show names of variables in myDF
summary(myDF) # Summarize variables in data frame
# 3.8 The basic objects: Lists ============================================= 3.8
# A list generated by the lm model ---------------------------------------------
myVar1 = c(1:8) # Set up an x (predictor) variable
myVar2 = c(3, 5, 4, 6, 7, 9, 2, 9) # Set up a y (response) variable
myModel = lm(myVar2 ~ myVar1) # Create a linear model of myVar2 on myVar1
myModel # Print the model output
attributes(myModel) # Show the elements in the model output
myModel$residuals # Show the residuals from the model
# Double brackets v. Single bracket list indexing ------------------------------
myModel[2] # Single bracket index result
myModel[[2]] # Double bracket index result
myModel[2][1] # Single [] won't open list item
myModel[[2]][1] # Double [[]] allows list item access
myModel$residuals[1] # $ referencing works the same way
# Create a list ----------------------------------------------------------------
myList = list(1, # List starting with model num (1)
myModel, # Then the myModel list
"This is a discussion of myModel") # Then some discussion of myModel
names(myList) = c("ModelNumber", # Create the list names
"ModelOutput","ModelDiscussion")
myList # Show myList
myList$ModelOutput$residuals # Residuals from myModel in myList
# unlist() ---------------------------------------------------------------------
myList = list(c(1,2,3),c("a","b","c"),"It's numbers and letters!")
typeof(myList) # Show object type for myList
myList # Print myList
myNotList = unlist(myList) # New object = unlisted myList
typeof(myNotList) # Show type of new unlisted object
myNotList # Show my new object
# 3.9 A few things about working with objects ============================== 3.9
# Listing active objects -------------------------------------------------------
objects() # Show all active objects
# Removing objects -------------------------------------------------------------
myVector = 1:10 # Create some objects
myNewVector = myVector+3
myAnimal = "aardvark"
objects() # List the objects
rm(myVector) # Remove an object
objects() # List the objects
rm(list=ls()) # Remove ALL objects
objects() # List the objects
# Object overwriting -----------------------------------------------------------
myNumber = 5 # Assign value 5 to myNumber
myNumber = 7 # Assign value 7 to myNumber
myNumber # Show that 7 replaced 5
myNumber = 5 # Assign value 5 to myNumber
myNumber = myNumber + 4 # Add 4 to myNumber
myNumber # Show new value for myNumber
# 3.10 Object Attributes ================================================== 3.10
# Set attributes for row and column names in a data frame
myData = data.frame(cbind( # Create data frame w/2 vectors
c(1, 0, 1, 1, 0), # Vector 1
c(24, 38, 22, 51, 17))) # Vector 2
# The attr approach
attr(myData, "names") = # Set col names to identify vars
c("sex", "age")
attr(myData, "row.names") = # Set row names to identify obs
c("Mary", "Mike", "Mia", "Mish", "Mark")
myData # Display the data frame
attr(myData, "names")=NULL # Erase attributes
# The rownames/colnames approach
rownames(myData) = # Add row names
c("Mary", "Mike", "Mia", "Mish", "Mark")
colnames(myData) = c("sex", "age") # Add column names
# Using str() to show the structure of a data frame ----------------------------
str(myData) # Show structure of myData data frame
# 3.11 Objects and environments =========================================== 3.11
# 3.12 Object classes ===================================================== 3.12
x = 5 # Create an object
class(x) # Show object class
class(mean) # Show class of existing R command
class(x) = "my made up class" # Set new custom class for x
attributes(x) # Show attributes of x
str(x) # Show structure of x
# 3.13 The pseudo storage modes =========================================== 3.13
# 3.14 Date and time as storage modes ===================================== 3.14
# 3.15 Factors ============================================================ 3.15
# Figure 3.2 - A graphic showing factor vectors --------------------------------
# This is another exercise in drawing with R.
# This starts with the custom ovals function from chapter 10 and the
# custom arrows function from chapter 14 already loaded.
png(filename = "illustrations/fig-3-2-R factors.png",
units = "in", # Set measurements in inches
res = 1200, # Set resolution at 1200dpi
width = 6, # Width at 6 inches
height = 4) # Height at 4 inches
par(mai = c(0, 0, 0, 0)) # Set margins - no Title
par(usr = c(0, 1, 0, 1)) # Set coordinates to 0,1 space
plot.new() # Start a new plot
text(x = .15, y = .97, # The vector you see text
labels = "The vector\nyou see:",
pos = 1, # Place text below and centered
cex = 1.5, # Set font size
font = 2, # Bold
family = "serif")
text(x = .15, y = .65, # What you see vector
labels='
"Small"\n"Small"\n"Large"\n"Large"\n"Large"\n"Small"\n"Small"',
pos = 1, # Place text below and centered
cex = 1, # Font size
family = "sans")
lines(
x = c(.09, .075, .075, .09), # Add brackets for what you see vector
y = c(.65, .65, .2, .2),
col = "black",
lwd = 2)
lines(
x = c(.21, .225, .225, .21),
y = c(.65, .65, .2, .2),
col = "black",
lwd = 2)
segments(x0 = .3, x1 = .3, # Dividing Line
y0 = .05, y1 = .95,
lwd = 4,
col = "gray")
text(x = .6, y = .97, # Vectors R Sees text
labels = "The vectors R sees:",
pos = 1, # Place text below and centered
cex = 1.5, # Font size
font = 2,
family = "serif")
text(x = .45, y = .89, # Levels Vector text
labels = "The\nLevels\nVector",
pos = 1, # Place text below and centered
cex = 1, # Font size
font = 2, # Bold
family = "serif")
text(x = .45, y = .625, # Levels Vector
labels = 'L1: "L"\nL2: "S"',
pos = 1, # Place text below and centered
cex = 1, # Font size
family = "sans")
lines( # Add brackets for levels vector
x = c(.39, .375, .375, .39),
y = c(.65, .65, .475, .475),
col = "black",
lwd = 2)
lines(
x = c(.51, .525, .525, .51),
y = c(.65, .65, .475, .475),
col = "black",
lwd = 2)
# Pointers vector text
text(x = .6, y = .89,
labels = "The\nPointers\nVector",
pos = 1, # Place text below and centered
cex = 1, # Font size
font = 2, # Bold
family = "serif")
text(x = .6, y = .62, # Pointers vector
labels = "2\n2\n1\n1\n1\n2\n2",
pos = 1, # Place text below and centered
cex = 1.05, # Font size
family = "sans")
lines( # Add brackets for pointers vector
x = c(.565, .55, .55, .565),
y = c(.65, .65, .2, .2),
col = "black",
lwd = 2)
lines(
x = c(.635, .65, .65, .635),
y = c(.65, .65, .2, .2),
col = "black",
lwd = 2)
text(x = .75, y = .89, # Labels vector text
labels = "The\nLabels\nVector\n(optional)",
pos = 1, # Place text below and centered
cex = 1, # Font size
font=2, # Bold
family = "serif")
text(x = .75, y = .625, # Labels vector
labels = '"Large"\n"Small"',
pos = 1, # Place text below and centered
cex = 1, # Font size
family = "sans")
lines( # Add brackets for labels vector
x = c(.69, .675, .675, .69),
y = c(.65, .65, .475, .475),
col = "black",
lwd = 2)
lines(
x = c(.81, .825, .825, .81),
y = c(.65, .65, .475, .475),
col = "black",
lwd = 2)
dev.off() # Output png file
# Factor levels and labels -----------------------------------------------------
c2ltr = c("FR", "UK", "SW", # Create a variable with 2 letter
"NK", "SO") # country abbreviations
cname = c("France", "U.K.", "Sweden", # Create a variable with
"North Korea", "Somalia") # country names.
country = c("UK", "SW", "FR", # Create dataset w/country codes
"SO", "NK")
regime = c(rep("dem", 3), # and democratic status
rep("nondem", 2))
nations.df = # Join into a data frame
data.frame(regime, country)
nations.df # Print the new data frame
#now add labels
nations.df$country = # Create a country factor
factor(nations.df$country,
levels = c2ltr, # Connecting 2 letter codes
labels = cname) # with country names
nations.df # Print data frame
# Controlling factors with the factor() command --------------------------------
# Set up the factor from figure 3.2
myFactor = factor(c("S", "S", "L", "L", "L", "S", "S"))
str(myFactor) # Show factor structure
# Change the ordering of the levels
myFactor = factor(myFactor, # Use values from current factor
levels=c("S", "L")) # Specify the levels
str(myFactor) # Show factor structure
# Add factor labels
myFactor = factor(myFactor, # Use values from current factor
labels=c("Small", "Large")) # Specify labels
str(myFactor) # Show factor structure
# Add additional level ---------------------------------------------------------
myFactor = factor(myFactor, # Use values from current factor
levels=c("Small", "Medium", # Specify the levels w/addition
"Large"))
str(myFactor) # Show factor structure
levels(myFactor) = # Use levels() to add new level
c(levels(myFactor), "X-Large") # Combine old levels with new
summary(myFactor) # Summarize myFactor w/new levels
# Add new observations
myFactor = factor(c(myFactor, "Medium"), # Use values from current factor
levels = c("Small", "Medium", "Large")) # Specify the levels w/addition
str(myFactor) # Show factor structure
# Adding observations ----------------------------------------------------------
myFactor2 = c(myFactor, "Medium") # Can't concatenate w/new values
myFactor2 # Show result
myFactor2 = c(myFactor, 2) # Can add to pointer vector
myFactor2 # But, dumps us out of factor mode
is.factor(myFactor2)
# The data frame approach to adding observations -------------------------------
myDF = data.frame(myFactor) # Put the factor into a data frame
myDF = rbind(myDF, "Medium", "Small") # Add 2 new observations
myDF # Show result
myFactor2 = myDF$myFactor # Return to vector status
myFactor2 # Confirm result
# Combining levels -------------------------------------------------------------
myFactor2 # Display the factor
levels(myFactor2) # Show the current levels
levels(myFactor2) = # Modify the levels to combine
c("Small", "Large", "Large") # Medium with Large.
myFactor2 # Show the new version
# Ordered and unordered factors ------------------------------------------------
mySize = c("small", "medium", # Create variable of all sizes
"large", "X-large")
sort(mySize) # Sort (alphabetical default)
mySize = factor(mySize, # Set as factor
levels = mySize, # Set factor levels from variable
ordered = T) # Make it an ordered factor
sort(mySize) # Sort (now ordered)
myData = c("small", "large", # Here is some data w/sizes
"small", "X-large", "medium")
sort(myData) # Sort (alphabetical default)
myData = factor(myData, # Make it a factor
levels = mySize) # Use levels from mySize
sort(myData) # Sort-now based on factor levels
# 3.16 Coercing storage modes ============================================= 3.16
myVector = c(1, 15, 7, "Smith") # Set up a vector
typeof(myVector) # Show type for vector
myVector[2] + 1 # Try math w/2nd element in vector
typeof(myVector[2]) # Type for 2nd element in vector
myVector = as.numeric(myVector) # Vector forced to numeric
myVector # Print vector
typeof(myVector) # Type for forced numeric vector
# 3.17 The curse of number-character-factor confusion ===================== 3.17
myData = c(7, 8, "missing", 8) # Here we simulate the csv input
sum(myData) # If we sum myData we get an error
myData[1] # We see that the value is a character
myData[1] + 2 # We get errors with numeric operations
as.numeric(myData[1]) # Transforming to numeric fixes this
as.numeric(myData[1]) + 2 # Now we can do a numeric operation
# Character variable factors to numbers ----------------------------------------
# NOTE: Since R 4.0.0, data.frame() defaults to stringsAsFactors = FALSE, so
# character columns stay character. The examples below are about what happens
# when strings DO become factors (as in older R versions, or in data read with
# stringsAsFactors = TRUE), so the conversion is requested explicitly.
animal = c(rep("kangas", 4),               # Create a character variable
           rep("koalas", 5))
myData = data.frame(animal,                # Putting the data in a data frame
  stringsAsFactors = TRUE)                 # and converting character to factor
levels(myData$animal)                      # Show the factor levels
aninum = as.numeric(myData$animal)         # Create a numeric version (the codes)
myData = cbind(myData, aninum)             # Add that to the data frame
myData                                     # Show results
# The number factor confusion illustrated --------------------------------------
myVar = c(rep(7, 4),                       # Create variable w/ "missing"
          "missing", rep(8, 5))            # value which forces to character
myData = data.frame(myVar,                 # which --> factor in data frame
  stringsAsFactors = TRUE)
levels(myData$myVar)                       # Show levels of unwanted factor
myVar2 = as.numeric(myData$myVar)          # Convert to numeric: this returns
                                           # the level codes, not 7s and 8s
myData = cbind(myData, myVar2)             # Add to data frame
myData                                     # Show data
# Factors to characters --------------------------------------------------------
animal = c(rep("kangas", 4),               # Create some data
           rep("koalas", 5))
myData = data.frame(animal,                # Putting the data in a data frame
  stringsAsFactors = TRUE)                 # (as a factor)
myData$animal[4]                           # Let's take a look at obs 4
myData$animal[4] = "koalas"                # Now we'll change it to "koalas"
myData$animal[4]                           # Another look at num 4
# Errors from trying to append new factor levels -------------------------------
myData$animal[4] = "hippopotami"           # When we try to change it to
                                           # "hippopotami" we get a warning
                                           # (invalid factor level) and the
                                           # value is set to NA
# Solution 1 for the character/factor conundrum -- using the I() function ------
animal = c(rep("kangas", 4), rep("koalas", 5))
myData = data.frame(I(animal))             # Putting the data in a data frame
                                           # but force animal to stay character
typeof(myData$animal)                      # Just checking on the datatype
myData$animal[4]                           # Take a look at obs 4
myData$animal[4] = "hippopotami"           # Now we can make the change
myData$animal[4]                           # Voila!
# Solution 2 for the character/factor conundrum -- adding another factor level -
animal = c(rep("kangas", 4), rep("koalas", 5))
myData = data.frame(animal,                # Putting the data in a data frame
  stringsAsFactors = TRUE)                 # (as a factor)
typeof(myData$animal)                      # Checking on our datatype - integer?!
myData$animal = factor(myData$animal,      # We'll add a new level to the mix
  levels =                                 # with the levels option
    c(levels(myData$animal),               # Combining old levels
      "hippopotami"))                      # with our new entry
myData$animal[4]                           # Let's look at animal[4]
myData$animal[4] = "hippopotami"           # Now we can add "hippopotami" because
myData$animal[4]                           # that level is included in the factor
# Solution 3 for the character/factor conundrum -- manual conversions ----------
animal = c(rep("kangas", 4), rep("koalas", 5))
myData = data.frame(animal,                # Putting the data in a data frame
  stringsAsFactors = TRUE)                 # (as a factor)
typeof(myData$animal)                      # Check on the data type
myData$animal =                            # Force the variable back to character
  as.character(myData$animal)
typeof(myData$animal)                      # Recheck the type -- that works!
myData$animal[4]
myData$animal[4] = "hippopotami"           # Now we can make the change to obs 4
myData$animal[4]
# R Code for Chapter 4: Getting your Data into R
#===============================================================================
# Chapter 4-- Getting your data into R
#===============================================================================
# 4.1 Entering data ======================================================== 4.1
# 4.1a Entering data with the concatenate function ======================== 4.1a
myVar = c(1, 7, 3, 5, 9, 21, 8) # Entering data with concatenate
myVar # Print data
myWords = c("ant", "ball", "clown") # Concatenate words into variable
myWords # Print variable
# 4.1b Joining vectors into matrices and data frames ====================== 4.1b
# cbind
temp = c(78.4, 65.3, 72.9, 81.2) # Setup some data
rain = c(.025, .001, 0, 1.2)
day = c(1, 2, 3, 4)
weather1 = cbind(day, temp, rain) # cbind demonstration
weather1 # Show results
# rbind
day1 = c(.025, 78.4) # Setup some data
day2 = c(.001, 65.3)
day3 = c(0, 72.9)
day4 = c(1.2, 81.2)
weather2 = rbind(day1, day2, # rbind demonstration
day3, day4)
weather2 # Show matrix
colnames(weather2) = c("rain", "temp") # Add variable names for columns
weather2 # Show weather2 matrix w/names
# concatenating different length vectors
temp = c(78.4, 65.3, 72.9, 81.2) # Setup some data
rain = c(.025, .001, 0, 1.2)
day = c("Saturday", "Sunday")
weekend.weather = data.frame( # Combine unequal length vectors
cbind(day, temp, rain)) # with cbind
weekend.weather # Show results
# 4.1c Entering data with the R spreadsheet =============================== 4.1c
# Open a spreadsheet-style editor on an existing set of data
myX <- c(1, 2, 3)                            # First variable (numeric)
myY <- c("a", "b", "c")                      # Second variable (character)
myData <- cbind(myX, myY)                    # cbind() gives a character matrix here
data.entry(myData)                           # Edit it in the data entry spreadsheet
# Start a new data list from a single dummy value
x <- 1
data.entry(x)
# The edit() approach: works on a data frame and returns the edited copy
myX <- c(1, 2, 3)                            # First variable (numeric)
myY <- c("a", "b", "c")                      # Second variable (character)
myData.df <- data.frame(myX, myY)            # This time build a real data frame
myData.df <- edit(myData.df)                 # Keep the result of the editing session
# 4.2 Creating data ======================================================== 4.2
# 4.2a Simple sequence and repetitions ==================================== 4.2a
# Colon sequences: from:to in steps of 1
myVar1 <- 0:10                               # Integers 0 through 10
myVar1
myVar2 <- 10:22                              # Integers 10 through 22
myVar2
myVar3 <- -5:5                               # Integers -5 through 5
myVar3
# seq(): sequences with an arbitrary step
myVar4 <- seq(from = 0, to = 5, by = 0.5)    # 0 up to 5 in steps of 0.5
myVar4
myVar5 <- seq(from = 7, to = 5, by = -0.25)  # 7 down to 5 in steps of 0.25
myVar5
# rep(): repeated values (these reuse the names myVar1 - myVar3)
myVar1 <- rep(7, 3)                          # Three 7's
myVar1
myVar2 <- rep(c(1, 2), times = c(5, 3))      # Five 1's followed by three 2's
myVar2
myVar3 <- rep(c("Fred", "Wilma"),            # Two "Fred" followed by
              times = c(2, 4))               # four "Wilma"
myVar3
# 4.2b Generating factors ================================================= 4.2b
# gl() generates a factor from a regular pattern of levels
myFactor <- gl(
  n = 3,                                     # Number of levels
  k = 2,                                     # Consecutive repeats of each level
  length = 6,                                # Total length of the result
  labels = c("small", "med", "large")        # One label per level
)
myFactor                                     # Show the factor
# Same idea with all arguments named: 2 levels, 4 repeats each, and the pattern
# is recycled until 10 values have been produced
myFactor2 <- gl(n = 2, k = 4, length = 10, labels = c("y", "n"))
myFactor2                                    # Show the factor
# 4.2c Random numbers and statistical distributions ======================= 4.2c
# Draws from a uniform distribution
myRand <- runif(10)                          # 10 draws from the default range 0 to 1
myRand                                       # Full precision
round(myRand, 2)                             # Rounded to 2 decimal places
myRand2 <- runif(5, min = 0, max = 100)      # 5 draws between 0 and 100
round(myRand2, 2)
myRand3 <- runif(3, 0, 10)                   # 5 draws between 0 and 10 (positional min, max)
round(myRand3, 2)
# Draws from a normal distribution
myNorm <- rnorm(10)                          # 10 draws, default mean 0 and sd 1
round(myNorm, 2)
myNorm2 <- rnorm(10, mean = 50, sd = 10)     # 10 draws with mean 50 and sd 10
round(myNorm2, 2)
myNorm2 <- rnorm(10000)                      # 10,000 draws from the standard normal
# Send the plot to a 6 x 4 inch PNG file rendered at 1200 dpi
png(filename = "illustrations/fig-4-1-normal dist histogram.png",
    width = 6, height = 4, units = "in", res = 1200)
par(mai = c(1, 1, 0.25, 0.25))               # Margins in inches; no room kept for a title
hist(myNorm2, main = NA)                     # Histogram with the title switched off
dev.off()                                    # Close the device, which writes the file
# 4.3 Importing data ======================================================= 4.3
# 4.3a The working directory ============================================== 4.3a
setwd("C:/data")                             # Make C:/data the working directory
getwd()                                      # Report the current working directory
# Pick a file in a dialog and switch to the folder that contains it
# (choose.files() is only available on Windows builds of R)
setwd(dirname(choose.files()))
# 4.4 The read command: Overview =========================================== 4.4
# 4.5 Reading from the clipboard =========================================== 4.5
# Tab-delimited data copied to the clipboard, first row holds the names
myData <- read.delim("clipboard", header = TRUE)
# 4.6 Reading blank delimited data ========================================= 4.6
# Whitespace-delimited file without a header row
myData <- read.table("myDataFile.txt", header = FALSE)
# 4.7 Reading comma separated values ======================================= 4.7
# Comma-separated file with a header row
myData <- read.csv("myDataFile.csv", header = TRUE)
# 4.8 Reading tab separated values ========================================= 4.8
# Tab-separated file with a header row
myData <- read.delim("myDataFile.txt", header = TRUE)
# 4.9 Fixed width data ===================================================== 4.9
# read.fwf() receives the field widths through its `widths` argument; the name is
# spelled out in full instead of relying on partial matching of `width`.
# NOTE(review): `filename` is not defined in this part of the script -- it must
# hold the path of the fixed-width file before this line is run.
myData <- read.fwf(filename, widths = c(5, 2, 3, 2, 15, 3, 2))
# Reading multiline data
# Each record spans two lines, so we set up one vector of widths per line.
# Negative widths mark columns that are skipped.
line1 <- c(3, -2, 5, 2, 3, 2, 15)            # We'll skip the line numbers with the -2
line2 <- c(-3, -2, 3, 2)                     # Skip obs & line numbers on 2nd line
myData <- read.fwf("myDataFile.txt", widths = list(line1, line2))
# 4.10 Generic tabular data =============================================== 4.10
# Three different kinds of value separators:
myData <- read.table("myDataFile.txt", header = TRUE, sep = ",")   # Comma
# The tab escape is written with a backslash ("\t"); "/t" is a two-character
# string, which read.table() rejects as a separator.
myData <- read.table("myDataFile.txt", header = TRUE, sep = "\t")  # Tab
myData <- read.table("myDataFile.txt", header = TRUE, sep = "%")   # Percent sign
# 4.11 Importing foreign filetypes ======================================== 4.11
library(foreign)                             # Readers for other packages' file formats
myData <- read.dta("c:/data/mydata.dta")     # Read a Stata .dta file
# 4.11a Exporting data in foreign formats ================================ 4.11a
# 4.12 Integrating SQL with R ============================================= 4.12
# 4.13 Extracting data from complex datasources =========================== 4.13
# 4.14 Web scraping ======================================================= 4.14
# 4.15 Dealing with multi-dimensional tables ============================== 4.15
# 4.16 Importing problematic characters =================================== 4.16
# Strip white example - drops extra leading or trailing spaces from the fields
myData <- read.delim(
  "myDataFile.txt",                          # File to read
  sep = "\t",                                # Fields are tab separated
  strip.white = TRUE,                        # Trim surrounding white space
  header = TRUE                              # First row holds the variable names
)
# 4.17 More resources ===================================================== 4.17
R Code for Chapter 5: Reviewing and Summarizing Data
#===============================================================================
# Chapter 5 -- Reviewing and summarizing data
#===============================================================================
# Set up some synthetic data to use in chapters five and six
# NOTE: The animal variable is wrapped in factor() explicitly. Since R 4.0.0
# data.frame() no longer turns character vectors into factors by default, and
# the summaries in these chapters expect animal to be a factor. Making the
# conversion explicit works whatever the stringsAsFactors setting is.
ch5data <- data.frame(                       # Create a data frame with 40 rows
  year = c(1935:1959,                        # First variable is years:
           1940:1949,                        # two colon sequences plus
           1935, 1942, 1958, 1964, 1970),    # five individually listed years
  animal = factor(c(rep("Mouse", 25),        # Next variable is animal type:
                    rep("Kangaroo", 10),     # 25 mice, 10 kangaroos
                    rep("Elephant", 5))),    # and 5 elephants
  weight = round(c(                          # Next is plausible weights,
    rnorm(25, mean = 0.06, sd = 0.006),      # normally distributed around a
    rnorm(10, mean = 100, sd = 14),          # mean and std. dev. per animal
    rnorm(5, mean = 12000, sd = 700)),       # type,
    3),                                      # rounded to 3 digits
  captive = rep(                             # Finally a captivity status from
    c(TRUE, FALSE, FALSE, FALSE), 10))       # a repeated TRUE/FALSE pattern
# Some overview ----------------------------------------------------------------
attributes(ch5data)                          # Show data frame attributes
str(ch5data)                                 # Show data frame structure
# data frame dimensions
names(ch5data)                               # Show variable names in the data frame
dim(ch5data)                                 # Show the dimensions (rows, columns)
nrow(ch5data)                                # Show the number of rows
dim(ch5data)[1]                              # Same thing via dim()
ncol(ch5data)                                # Show the number of columns
dim(ch5data)[2]                              # Same thing via dim()
# 5.1 Summarizing Data ===================================================== 5.1
summary(ch5data)                             # Summary statistics for ch5data
# Individual summary statistics
# The column is called animal; ch5data$animals does not exist and would give
# NULL, i.e. a length of 0.
length(ch5data$animal)                       # Show num of obs in animal variable
length(ch5data)                              # Show num of variables in ch5data
mean(ch5data$year)                           # Show mean of year
median(ch5data$weight)                       # Show median of weight
sum(ch5data$weight)                          # Show sum of variable weight
max(ch5data$year)                            # Show maximum value of year
min(ch5data$year)                            # Show minimum value of year
range(ch5data$year)                          # Show range (min and max) of year
sd(ch5data$year)                             # Show standard deviation of year
unique(ch5data$year)                         # Show unique values in year
# missing values ---------------------------------------------------------------
myVar <- c(1, 2, 3, 4, NA, 5)                # One of the six values is missing
mean(myVar)                                  # NA propagates, so the result is NA
mean(myVar, na.rm = TRUE)                    # Drop the NA before averaging
# 5.2 Sampling ============================================================ 5.2
# Look at the top of the dataset
head(ch5data, 7)                             # First 7 observations
# Sampling from a simple vector
myVector <- 1:1000                           # The integers 1 to 1000
sample(myVector, 5)                          # 5 values drawn at random
# Sampling rows of a data frame: draw row numbers, then index with them
sample5 <- sample(seq_len(nrow(ch5data)), 5) # 5 randomly chosen row numbers
ch5data[sample5, ]                           # The rows with those numbers
ch5data[sample(seq_len(nrow(ch5data)), 5), ] # Both steps in a single expression
# 5.3 Reviewing Data by Categories ===================================== 5.3
table(ch5data$animal, ch5data$captive)       # Counts with animal in rows, captive in columns
table(ch5data$captive, ch5data$animal)       # Same counts with the dimensions swapped
# The crosstabs approach (see appendix I) --------------------------------------
# The one-sided formula names the dimensions; data supplies the variables
xtabs(~ animal + captive, data = ch5data)
# Keep the crosstab in an object for later analysis
my.xtab <- xtabs(~ animal + captive, data = ch5data)
summary(my.xtab)                             # Chi-square test of independence for the x-tab
# Using the split() function ---------------------------------------------------
lapply(split(ch5data, ch5data$captive), summary)  # Summaries by captivity status
lapply(split(ch5data, ch5data$animal), summary)   # Summaries by animal type
# To split on two factors at once the grouping variables go into a list();
# as.list() takes a single object and would ignore the second factor.
lapply(split(ch5data, list(ch5data$captive, ch5data$animal)), summary)
lapply(split(ch5data, ch5data$captive), summary)  # Summaries by captivity status
lapply(split(ch5data$weight, ch5data$animal), mean)  # Mean weight per animal type
# median() needs numeric data and stops on a data frame, so split the weight
# column rather than the whole data frame.
lapply(split(ch5data$weight, ch5data$animal), median, na.rm = TRUE)
# The cut() function -----------------------------------------------------------
# Bin the weights into three labelled classes, split the data frame by class,
# and summarise every piece.
lapply(
  X = split(
    x = ch5data,                             # Data frame to split up
    f = cut(
      x = ch5data$weight,                    # Variable that defines the groups
      breaks = c(0, 50, 500, 100000),        # Break points (including both ends)
      labels = c("small", "large", "huge")   # One label per interval
    )
  ),
  FUN = summary                              # Function applied to each piece
)
# 5.4 Histograms =========================================================== 5.4
myVar <- seq(from = -3, to = 3, by = 0.0001) # Grid from -3 to 3 in steps of .0001
# rnorm() given a vector only uses its length: one standard normal draw is
# generated per element of myVar.
myNormVar <- rnorm(myVar)
# Send the plot to a 6 x 4 inch PNG file rendered at 1200 dpi
png(filename = "illustrations/fig-5-1-histogram.png",
    width = 6, height = 4, units = "in", res = 1200)
par(mai = c(1, 1, 0.25, 0.25))               # Margins in inches; no room kept for a title
hist(myNormVar, main = NA)                   # Histogram with the title (main = ) turned off
dev.off()                                    # Close the device, which writes the png file
# Kernel Density Plot
# Send the plot to a 6 x 4 inch PNG file rendered at 1200 dpi
png(filename = "illustrations/fig-5-2-kernel density.png",
    width = 6, height = 4, units = "in", res = 1200)
par(mai = c(1, 1, 0.25, 0.25))               # Margins in inches; no room kept for a title
plot(density(myNormVar), main = NA)          # Draw the estimated density curve
polygon(density(myNormVar), col = "gray")    # Optionally shade the area under it
dev.off()                                    # Close the device, which writes the png file
# 5.5 Scatterplots ===================================== 5.5
myX <- c(1, 5, 19, 7, 6, 18, 11, 10)         # Horizontal coordinates
myY <- c(4, 3, 12, 7, 8, 9, 15, 9)           # Vertical coordinates
# Send the plot to a 6 x 4 inch PNG file rendered at 1200 dpi
png(filename = "illustrations/fig-5-3-basic scatterplot.png",
    width = 6, height = 4, units = "in", res = 1200)
par(mai = c(1, 1, 0.25, 0.25))               # Margins in inches; no room kept for a title
plot(myX, myY)                               # Plain scatterplot of the two variables
dev.off()                                    # Close the device, which writes the png file
# Adding labels to the points
myLabels <- c("Bach", "Beethoven", "Brahms", # One label per observation, in the
              "Mozart", "Chopin",            # same order as myX and myY
              "Tchaikovsky", "Satie", "Bartok")
# Send the plot to a 6 x 4 inch PNG file rendered at 1200 dpi
png(filename = "illustrations/fig-5-4-scatterplot labels.png",
    width = 6, height = 4, units = "in", res = 1200)
par(mai = c(1, 1, 0.25, 0.25))               # Margins in inches; no room kept for a title
plot(myX, myY)                               # Scatterplot first, labels on top of it
text(
  myX, myY,                                  # Anchor each label at its point
  labels = myLabels,                         # Text to draw
  pos = 3,                                   # Position 3 = above the point
  xpd = TRUE                                 # Let labels extend beyond the plot region
)
dev.off()                                    # Close the device, which writes the png file
# Only add labels to more extreme points (not in book)
myData <- data.frame(myX, myY, myLabels)     # Package data in data frame
plot(myData$myX, myData$myY)                 # Scatterplot of myX & myY
# Keep the observations where Y lies more than 1 std dev from its mean AND the
# standardized deviation of Y exceeds twice the standardized deviation of X.
# X is standardized with its own mean and std dev (the mean of Y was used for
# X before, which mixed the two variables).
myData2 <- myData[
  abs(myData$myY - mean(myData$myY)) > sd(myData$myY) &
    abs((myData$myY - mean(myData$myY)) / sd(myData$myY)) >
      2 * abs((myData$myX - mean(myData$myX)) / sd(myData$myX)), ]
text(myData2$myX, myData2$myY,               # Use coordinates of extreme points
     myData2$myLabels,                       # to place labels
     pos = 4)                                # to the right of the points
# 5.6 Pairs Plots ========================================================== 5.6
myZ <- c(3, 9, 12, 2, 2, 17, 1, 8)           # Third variable to go with myX & myY from sect 5.5
myData <- data.frame(myX, myY, myZ)          # All three variables in one data frame
# Send the plot to a 6 x 4 inch PNG file rendered at 1200 dpi
png(filename = "illustrations/fig-5-5-pairs plot.png",
    width = 6, height = 4, units = "in", res = 1200)
par(mai = c(1, 1, 0.25, 0.25))               # Margins in inches; no room kept for a title
pairs(myData)                                # Scatterplot matrix of every pair of variables
dev.off()                                    # Close the device, which writes the png file
R Code for Chapter 6: Sorting and Selecting Data
#===============================================================================
# Chapter 6 -- Sorting and selecting data
#===============================================================================
# We'll use the same data we set up at the beginning of chapter 5 throughout
# this chapter
# 6.1 Using index values for selection ===================================== 6.1
selection <- c(1, 5, 7, 11:14)               # Row numbers to keep
myDF <- ch5data[selection, ]                 # Select rows through the stored vector
myDF <- ch5data[c(1, 5, 7, 11:14), ]         # Same selection with the vector written inline
myDF                                         # Show the selected rows
myDF[-2, ]                                   # Negative index: everything except row 2
myDF2 <- myDF[, -c(1, 3)]                    # Leave out columns 1 and 3
myDF4 <- myDF[-c(1, 3, 7), ]                 # Leave out rows 1, 3 and 7
# 6.2 Selecting with conditional values===================================== 6.2
myDF <- ch5data                              # Work on a copy with a shorter name
kDF <- myDF[myDF$animal == "Kangaroo", ]     # Rows where the animal is a kangaroo
head(kDF, 3)                                 # First 3 kangaroo observations
# Two conditions combined with &: kangaroos weighing more than 100
bigkDF <- myDF[myDF$animal == "Kangaroo" & myDF$weight > 100, ]
# The which() approach ---------------------------------------------------------
# which() turns the logical condition into the matching row numbers
kselect <- which(myDF$animal == "Kangaroo" & # Kangaroos
                   myDF$weight > 100)        # weighing more than 100
kselect                                      # Show the row numbers
bigkDF2 <- myDF[kselect, ]                   # Use the row numbers to select
bigkDF2                                      # Show the selected rows
# 6.3 Using subset() ======================================================= 6.3
myData <- data.frame(
  myV1 = c(10, 20, 30, 40),                  # Set up 3 variables in a data frame
  myV2 = c("a", "b", "c", "d"),
  myV3 = c(1, 2, 3, 4))
myData                                       # Display the data frame
# Subset by rows
myData1 <- subset(myData, myV1 > 20)         # Select rows with myV1>20
myData1                                      # Display results
# Inside brackets the column has to be qualified with the data frame: myV1 only
# exists as a column of myData, not as a variable of its own.
myData2 <- myData[myData$myV1 > 20, ]        # Same selection with bracket method
myData2                                      # Display results
# Subset by columns
myData3 <- subset(myData, select = myV1)     # Keep only myV1 (still a data frame)
myData3
myData4 <- subset(myData, select = -myV3)    # Keep everything except myV3
myData4
myData3b <- myData[, 1]                      # Column 1 by position; a single column
myData3b                                     # comes back as a plain vector
myData4b <- myData[, -3]                     # All columns except the 3rd
myData4b
myData3c <- myData[, "myV1"]                 # Column picked by name (again a vector)
myData3c
myData4c <- myData[, c("myV1", "myV2")]      # Several columns picked by name
myData4c
# Subsetting with a vector
myData5 <- subset(myData,                    # Keep two columns,
                  select = c(myV1, myV3))    # myV1 and myV3
myData5
myData6 <- subset(myData,                    # Drop two columns,
                  select = -c(myV2, myV3))   # myV2 and myV3
myData6
mySelector <- c("myV1", "myV3")              # Column names stored in a character vector
myData7 <- subset(myData,                    # select = also accepts such
                  select = mySelector)       # a stored vector of names
myData7
# 6.4 Splitting data into groups with by() ================================= 6.4
myDF <- ch5data                              # Start again from the full chapter 5 data
summary(myDF)                                # Summary of the entire data frame
by(data = myDF, INDICES = myDF$animal, FUN = summary)  # One summary per animal type
# One summary per combination of animal type and captivity status
by(
  data = myDF,
  INDICES = interaction(myDF$animal, myDF$captive),    # Cross the two grouping variables
  FUN = summary                                        # Function applied to each group
)
#6.5 Splitting up continuous numeric data ================================== 6.5
# Splitting data with cut() ----------------------------------------------------
myVar <- rnorm(100)                          # 100 draws from the standard normal
myVar2 <- cut(myVar, breaks = -3:3)          # 6 bins, each 1 unit wide, from -3 to 3
table(myVar2)                                # Count of observations per bin
# Suppress labels --------------------------------------------------------------
# With labels = FALSE cut() returns the bin numbers instead of a labelled factor
myVar2 <- cut(myVar, breaks = -3:3, labels = FALSE)
table(myVar2)                                # Count of observations per bin number
# Using quantile() -------------------------------------------------------------
myVar <- rnorm(n = 100)                      # 100 draws from the standard normal
quantile(myVar)                              # With no probs given you get quartiles
quantile(myVar, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))   # Quintiles, typed out by hand
quantile(myVar, probs = seq(0, 1, by = 0.2))           # Quintiles, generated with seq()
# Custom cut points, concentrated in the tails of the normal curve
quantile(myVar, probs = c(0.0001, 0.01, 0.025, 0.5, 0.975, 0.99, 0.9999))
# cut and quantile together ----------------------------------------------------
myVar <- rnorm(n = 100)                      # Set up data drawn from normal dist
# Quartile bins: quantile() supplies the break points, so every bin holds the
# same number of observations. include.lowest = TRUE keeps the minimum value,
# which would otherwise fall outside the first (left-open) interval.
myVarQ <- cut(myVar,
              breaks = quantile(myVar),      # 4 bins (quartiles are the default)
              include.lowest = TRUE)
summary(myVarQ)                              # Show quartiles
head(cbind(myVar, myVarQ))                   # Show first 6 observations
# Quintile bins: the argument of quantile() is `probs`; it is spelled out in
# full rather than relying on partial matching of `prob`.
myVarQ2 <- cut(myVar,
               breaks = quantile(myVar, probs = seq(0, 1, by = 0.2)),  # 5 bins
               include.lowest = TRUE)
summary(myVarQ2)                             # Show quintiles
head(cbind(myVar, myVarQ2))                  # Show first 6 observations
# 6.6 Sorting and Ordering Data ============================================ 6.6
# 6.6a Sorting a variable ================================================= 6.6a
# Sorting example --------------------------------------------------------------
numbers <- c(1, 6, 5, 7, 8)                  # A few unsorted numbers
sort(numbers)                                # sort() returns the values in ascending order
# Ordering example -------------------------------------------------------------
fruit <- c("Apple", "banana", "apple", "Banana")  # Names in mixed case
fruit <- fruit[order(fruit)]                 # order() gives the positions; index with them
fruit                                        # Show the reordered vector
# 6.6b Ordering a data frame ============================================== 6.6b
# Show the difference between order() and sort() -------------------------------
numbers <- c(1, 6, 5, 7, 8)                  # A few unsorted numbers
sort(numbers)                                # sort(): the values themselves, in order
order(numbers)                               # order(): the positions of those values
# A data frame ordering example ------------------------------------------------
# Keep the example small: only the observations from 1941 through 1944
myDF2 <- subset(ch5data, subset = year > 1940 & year < 1945)
myDF2 <- myDF2[order(myDF2$year), ]          # Rearrange the rows by year
myDF2                                        # Show the result
# Ordering by two variables ----------------------------------------------------
# The second key (animal) decides the order among rows with the same year
myDF2 <- myDF2[order(myDF2$year, myDF2$animal), ]
myDF2                                        # Show the result