SGR Code from Chapters 1-6

R Code for Chapter 1: Introduction

#===============================================================================
# Chapter 1 -- Introduction
#===============================================================================

# 1.1a - R commands ======================================================= 1.1a

# This comment identifies the purpose of this R-script
# It is really just to demonstrate what R commands look like.

# R doesn't care about blank lines. My first command will create an
# x variable and give it a value. Although it is reasonably obvious,
# I'll also include a comment on the right explaining what I am doing.

myX1 = 7                               # This assigns the value 7 to myX1

myX2 = c(1, 5, 3, 8, 2, 4)             # Create myX2 with a set of values 

mean(myX2)                             # Find the mean for myX2 (23/6)

# Voila! Our first R program. Now we can highlight those commands and 
# use the "Run" button to send them to the R console for execution.


# 1.3b Command conventions ================================================ 1.3b
myVar = c(1, 2, 4, NA, 7)              # Create a variable w/some data
mean(myVar, na.rm = TRUE)              # Mean of myVar w/o missing values
                                       #   (TRUE is spelled out because T is
                                       #   an ordinary name that could be
                                       #   reassigned)
length(myVar)                          # Num of observations in myVar
                                       #   (the NA is counted as well)
ls()                                   # List all objects


# Command short and alternative forms
# Each call below draws a fresh random number and overwrites myRandNum;
# the four calls differ only in how the arguments are written.
myRandNum = rnorm(1)                   # A single random number from a normal
                                       #   distribution assuming defaults
myRandNum = rnorm(1, 0, 1)             # The same thing showing defaults
                                       #   (positional order: n, mean, sd)

myRandNum = rnorm(1,                   # A random number from normal dist
  mean = 0,                            #   with mean = 0    
  sd = 1)                              #   and std dev = 1

myRandNum = rnorm(n = 1,               # A random number from normal dist 
  sd = 1,                              #   with std dev = 1
  mean = 0)                            #   and mean = 0 (named arguments
                                       #   may be given in any order)

# Using help examples ----------------------------------------------------------

example(mean)                          # Run example from mean() help page


# Note:  The other examples from Chapter 1 are not included here, as they aren't
# really meant to be executed.

R Code for Chapter 2: A Sample Session

#===============================================================================
# Chapter 2 -- A sample session
#===============================================================================

options(width = 75)                    # Limit output width to fit book format  
 
# Read data from web.  (Note - this will change to a data site on
# www.sagepub.com/gaubatz)

myDF = read.delim("http://www.kktg.net/R/Chapter2Data.txt", 
  header = TRUE,                       # Data has headers for var names
  colClasses = c("character",          # Set storage modes of variables
    "numeric", "numeric", "numeric",   # This is just a convenience and  
    "numeric", "factor", "numeric"))   #   could be adjusted afterwards
                                       # One class per column, in file order:
                                       #   1 character, 4 numeric, 1 factor,
                                       #   1 numeric (7 columns in total)

summary(myDF)                          # Summarize dataset

# Standard deviations ----------------------------------------------------------
sd(myDF$GDPc, na.rm = TRUE)            # Get standard deviations
sd(myDF$MilSpend, na.rm = TRUE)        #   removing missing values
sd(myDF$EdSpend, na.rm = TRUE)
sd(myDF$FemLife, na.rm = TRUE)
sd(myDF$PressStat, na.rm = TRUE)
sd(as.numeric(myDF$PressFree),         # Note: current R versions stop with an
  na.rm = TRUE)                        #   error for sd() of a factor, so the
                                       #   factor is converted to its integer
                                       #   level codes first. The number is
                                       #   the sd of those codes and has no
                                       #   substantive meaning.

# A simple plot ----------------------------------------------------------------
png(filename = "illustrations/fig-2-1-femlife plot.png",
  width = 6,                           # Six wide
  height = 4,                          #   by four tall,
  units = "in",                        #   both measured in inches
  res = 1200)                          # Rendered at 1200dpi

par(mai = c(1, 1, .25, .25))           # Margins in inches: bottom, left,
                                       #   top, right

plot(myDF$PressStat, myDF$FemLife)     # Scatterplot of FemLife on PressStat

dev.off()                              # Close the device to write the file

# Hypothesis testing -----------------------------------------------------------
# Check equality of variances

var.test(                              # Run an F test comparing variances
  myDF$FemLife[myDF$PressFree == "F"], # FemLife in states w/ free press
  myDF$FemLife[myDF$PressFree == "NF"])#  compared to states w/o free press

# t-test -----------------------------------------------------------------------
t.test(                                # A t-test for the hypothesis that
  myDF$FemLife[myDF$PressFree == "F"], #  states w/ press freedom and 
  myDF$FemLife[myDF$PressFree == "NF"])#  states w/o press freedom have
                                       #  the same female life expectancy
                                       #  (var.equal is left at its default,
                                       #  so equal variances are not assumed)

# A simple boxplot -------------------------------------------------------------
png(filename = "illustrations/fig-2-2-femlife boxplot.png",
  width = 6,                           # Six wide
  height = 4,                          #   by four tall,
  units = "in",                        #   both measured in inches
  res = 1200)                          # Rendered at 1200dpi

par(mai = c(.5, .5, .25, .25))         # Tight margins: the figure has no
                                       #   title and no x label

boxplot(myDF$FemLife ~ myDF$PressFree) # One box of FemLife per PressFree level

dev.off()                              # Close the device to write the file

# Look at states missing press freedom status ----------------------------------

myDF$Country.name[myDF$PressFree == "" # List states missing press status
  & !is.na(myDF$FemLife)]              #  but not missing f life exp.
                                       #  (missing press status is matched as
                                       #  the empty string "", not as NA)

# Getting the factor levels sorted ---------------------------------------------
# Here we want to have our levels of press freedom displayed in substantive 
# rather than alphabetical order---that is Free, Partly Free, Not Free, rather
# than the alphabetical order. We also want missing values (NAs) to be treated
# as missing rather than as another discrete level.

str(myDF$PressFree)                    # Show current structure of factor
myDF$PressFree =                       # Reorder factor levels in
  factor(myDF$PressFree,               #  substantive rather than 
  levels = c("F", "PF", "NF"))         #  alphabetical order; values not
                                       #  listed in levels (such as "")
                                       #  become NA
str(myDF$PressFree)                    # Show new structure of factor

# A simple multi-variate model -------------------------------------------------

# Linear model of FemLife on GDPc, MilSpend and the PressFree factor
model1 = lm(FemLife ~ GDPc + MilSpend + PressFree, data = myDF)
summary(model1)                        # Show coefficients and fit statistics

# Another simple plot ----------------------------------------------------------
png(filename = "illustrations/fig-2-3-femlife GDPc plot.png",
  width = 6,                           # Six wide
  height = 4,                          #   by four tall,
  units = "in",                        #   both measured in inches
  res = 1200)                          # Rendered at 1200dpi

par(mai = c(1, 1, .25, .25))           # Room for axis labels, none for a title

plot(myDF$GDPc, myDF$FemLife)          # Scatterplot of FemLife on GDPc

dev.off()                              # Close the device to write the file

# Once more, with a log transformation -----------------------------------------
png(filename = "illustrations/fig-2-4-femlife log GDPc plot.png",
  width = 6,                           # Six wide
  height = 4,                          #   by four tall,
  units = "in",                        #   both measured in inches
  res = 1200)                          # Rendered at 1200dpi

par(mai = c(1, 1, .25, .25))           # Room for axis labels, none for a title

plot(log(myDF$GDPc), myDF$FemLife)     # Scatterplot of FemLife on the natural
                                       #   log of GDPc

dev.off()                              # Close the device to write the file

# A simple multi-variate model with log transformations ------------------------

# Same model as before, but with GDPc and MilSpend entered as natural logs
model2 = lm(FemLife ~ log(GDPc) + log(MilSpend) + PressFree, 
  data = myDF)
summary(model2)                        # Show coefficients and fit statistics

R Code for Chapter 3: R Objects

#===============================================================================
# Chapter 3 -- R Objects
#===============================================================================

# 3.1 R Objects and their Names ============================================ 3.1

# 3.2 How to think about data objects in R ================================= 3.2

# Figure 3.1: A graphic representation of my R objects approach ----------------

# This is just an exercise in drawing with R. The code doesn't really serve
# much purpose beyond that.  But, if you are needing to do work with shapes
# following the lead from Chapter 14, you might find some bits of use.

# This starts with the custom ovals function from chapter 10 and the
# custom arrows function from chapter 14 already loaded.

# Draws the schema entirely from primitives on an empty plot. Pieces are
# drawn in sequence, so later shapes paint over earlier ones.
# oval() and myArrow() are not defined in this section; they are the custom
# helpers mentioned above and must already be loaded.
png(filename = "illustrations/fig-3-1-R objects schema.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches

par(mai = c(0, 0, 0, 0))               # Set margins - no Title
par(usr = c(0, 1, 0, 1))               # Set coordinates to 0,1 space
                   
plot.new()                             # Start a new plot

oval(xcen = .55, ycen = .5,            # Top oval for obj type container
  xlen = .175, ylen = .025,            # Length of radii
  ewidth = 2,                          # Line width
  ecolor = "gray")                     # Line color

myArrow(x0 = .26, x1 = .445,           # Arrow from storage mode to object
  y0 = .75, y1 = .75,                  # Same y at both ends: horizontal
  angle = 45,                          # Set arrowhead angle
  ljoin = 1,                           # Set line join type
  lwd = 25,                            # Set line width
  col = "light gray")                  # Set color

polygon(                               # Arrow body
  x = c(.5, .5, .6, .6),
  y = c(.55, .7, .7, .55),
  col = "light gray",
  border = "light gray")
  
polygon(                               # Arrow head
  x = c(.55, .475, .625),
  y = c(.49, .55, .55),
  col = "light gray",
  border = "light gray")

myArrow(x0 = .8, x1 = .655,            # Arrow from class to object
  y0 = .85, y1 = .75,
  L = .25,                             # Set length of arrowhead
  angle = 30,                          # Set arrowhead angle
  ljoin = 2,                           # Set line join type
  lwd = 10,                            # Set line width
  col = "light gray")                  # Set color
    
polygon(                               # Rectangle around Storage Mode
  x = c(.001, .001, .249, .249),
  y = c(.45, .99, .99, .45),
  col = "white",
  lwd = 2,
  border = "gray")
    
polygon(                               # Rectangle around pseudo-Storage Mode
  x = c(.001, .001, .249, .249),       #   (lower part of the box above,
  y = c(.45, .625, .625, .45),         #   shaded to set it apart)
  col = gray(.8),
  lwd = 2,
  border = "gray")
  
polygon(                               # Rectangle around Class
  x = c(.8, .8, .95, .95),
  y = c(.8, .9, .9, .8),
  col = "white",
  lwd = 2,
  border = "gray")
                                       
symbols(x = .55, y = .75,              # Filled circle behind R Data Object
  circles = .1,                        # Set circle size
  inches = FALSE,                      # Don't constrain to inch size
  add = TRUE,                          # Add to current plot
  bg = "white")                        # Set background color

oval(xcen = .55, ycen = .75,           # Circle around R Data Object in center
  xlen = .1, ylen = .1*(3/2),          # Y radius adjusted for aspect ratio
                                       #   (the image is 6 wide by 4 high)
  ewidth = 6,                          # Set line size
  ecolor = "gray")                     # Set color

text(x = .55, y = .75,                 # R Data Object Text
  labels = "R Data\nObject",
  cex = 1.5,                           # Set font size
  font = 2,                            # Bold
  family = "serif")                    # Serif Font

text(x = .55, y = .375,                # Data Object Type Text
  labels = "Data Object Type",
  pos = 3,                             # Above and centered
  cex = 1.25,                          # Set font size
  font = 2,                            # Bold
  family = "serif")                    # Serif Font

text(x = .55, y = .325,                # List of Data Object Types
  labels = "Vector\nMatrix\nData Frame\nList",
  pos = 1,                             # Below and centered
  cex = 1,                             # Set font size
  family = "sans")                     # Sans serif font

text(x = .125, y = .9,                 # Data Object Storage Mode Text
  labels = "Storage Mode",
  pos = 3,                             # Above and centered
  cex = 1.25,                          # Set font size
  font = 2,                            # Bold
  family = "serif")                    # Serif Font

text(x = .125, y = .85,                # Storage Modes List
  labels = "Logical\nNumeric\nCharacter\n\n\nDate/Time\nFactor",
  pos = 1,                             # Below and centered
  cex = 1,                             # Set font size
  family = "sans")                     # Sans serif font
  
text(x = .875, y = .85,                # Data Object Class Text
  labels = "Class",
  cex = 1.25,                          # Set font size
  font = 2,                            # Bold
  family = "serif")                    # Serif Font


oval(xcen = .55, ycen = .05,           # Bottom oval for obj type container
  xlen = .175, ylen = .025,            # Same radii as the top oval
  ewidth = 2,                          # Line width
  ecolor = "gray")                     # Line color
 
segments(x0 = .375, x1 = .375,         # Left line for object type container
  y0 = .05, y1 = .5,                   # Runs between the two oval centers
  lwd = 2,
  col = "gray")
  
segments(x0 = .725, x1 = .725,         # Right line for object type container
  y0 = .05, y1 = .5,                   # Runs between the two oval centers
  lwd = 2,
  col = "gray")   

dev.off()                              # Output png file

# 3.3 R Object storage modes =============================================== 3.3

# Create some diverse objects

# Non-Data objects                     # A function to add 10 to any number
myFunction = function(x) {
  # Adds 10 to x. The value of the last expression is what the function
  # returns, so no explicit return() call is needed.
  x + 10
}
myModel =                              # Output from a regression model
  lm(c(1:10) ~ c(1, 3, 2, 5, 4:9))     #   (both sides have 10 values)

# Data objects
myInteger = as.integer(4)              # An integer (whole number)
myWholeNumber = 5                      # A whole number stored as double
myDouble = 3.7                         # A numeric-double number
myOtherInteger = as.integer(3.7)       # Non-whole num converted to integer
                                       #   (the fractional part is dropped)
myLogical1 = TRUE                      # A logical value set to TRUE
myLogical2 = FALSE                     # A logical value set to FALSE
myCharacter = "Hello World!"           # A character string


# typeof() reports the internal storage type; mode() groups integer and
# double together as "numeric".
typeof(myInteger)                      # Test "typeof" for integer
mode(myInteger)                        # Test "mode" for integer
typeof(myWholeNumber)                  # Test "typeof" for whole number
typeof(myDouble)                       # Test "typeof" for double
mode(myDouble)                         # Test "mode" for double
myOtherInteger                         # Show double (3.7) converted to integer
typeof(myOtherInteger)                 # Show "typeof" for converted to integer
mode(myOtherInteger)                   # Show "mode" for convert to integer
typeof(myLogical1)                     # Test "typeof" for logical
typeof(myCharacter)                    # Test "typeof" for character

# 3.4 R data object Types ================================================== 3.4

# Sample objects of the different data object types
myVector = c(1:5)                      # A vector of the integers 1 to 5
# A data frame with 5 rows and 3 columns (numbers, lower and upper case letters)
myDF = data.frame(c(1:5), letters[1:5], LETTERS[1:5])
# A list holding three items of different kinds
myList = list(c(1, 2, 3), "This is a list", TRUE)
# A second list example; note that this replaces the myList created just above
myList = list("This is a list with my Dataframe", myDF)

DOType = function(x){                  # DOType function ----------------------+
# This is a function to identify data object types. I think                    |
# of object type as a characterization of objects that hold collections        |
# of things.  These object types are vectors, matrices, data frames,           |
# lists, and factors.                                                          |
# If none of those types fit, then the function returns a statement            |
# that it is not a recognized data type.                                       |
#                                                                              |
# Checks run in order and a later match overrides an earlier one, so a         |
# plain list (for which is.vector() is also TRUE) is reported as "list".       |
#                                                                              |
  DOT = ""                             # Set default value for DOT             |
  if(is.vector(x)){DOT = "vector"}     # Check if is a vector                  |
  if(is.matrix(x)){DOT = "matrix"}     # Check if is a matrix                  |
  if(is.data.frame(x)){                # Check if is a data frame              |
    DOT="data frame"}                  #                                       |
  if(is.list(x) && !is.data.frame(x)){ # Check if is a list (and not a         |
    DOT="list"}                        #   dataframe)                          |
                                       #   (&& is the scalar form if() needs)  |
  if(is.factor(x)){DOT = "factor"}     # Check if is a factor                  |
                                       #                                       |
  if(DOT == ""){DOT = paste("Not a",   # Print a message if it is none         |
    "recognized data object")}         #   of the above                        |
  return(DOT)                          # Return the appropriate value          |
}                                      # End of function ----------------------+

# 3.5 The basic objects: Vectors =========================================== 3.5

myLogicalVector = c(TRUE, FALSE,       # Set up a logical vector (TRUE/FALSE
  TRUE, TRUE)                          #   are spelled out because T and F
                                       #   are names that can be reassigned)
myNumericVector = c(1, 2, 4, 7)        # Set up a numeric vector (stored as
                                       #   double, not integer)
myTextVector = c("a", "b", "7", "x")   # Set up a text vector
typeof(myLogicalVector)                # Show type for logical vector
typeof(myNumericVector)                # Show type for numeric vector
mode(myTextVector)                     # Show mode for text vector

# ------------------------------------------------------------------------------
myScalar = 1                           # Create a scalar with value 1
yourScalar = 2                         # Create a scalar with value 2
myScalar + yourScalar                  # Add the two scalars

# ------------------------------------------------------------------------------

myVector1 = c(0, 5, 18)                # Set up a numeric vector
typeof(myVector1)                      # Check vector type
myVector2 = c(TRUE, TRUE, FALSE)       # Set up a logical vector
typeof(myVector2)                      # Check vector type
myVector3 = c("Fred", "Joe", "Simon")  # Set up a character vector
typeof(myVector3)                      # Check vector type
myVector1 + myVector3                  # Add num & char vectors (error)
                                       #   The error is the point of this line;
                                       #   a sourced script stops here
myVector1 + myVector2                  # Add numeric and logical vectors
                                       #   (TRUE counts as 1, FALSE as 0)
as.logical(myVector1)                  # Treat numeric vector as logical
                                       #   (0 is FALSE, anything else TRUE)


# 3.5a Vector indices ===================================================== 3.5a

myVector = c(3, 9, 5)                  # Set up vector of numeric values
myVector[2]                            # Get the vector's 2nd element
myVector[3]                            # Get the vector's 3rd element

# ------------------------------------------------------------------------------

myVector = c(3, 9, 5)                  # Set up vector of numeric values
myIndex = 1                            # Set up a selection index
myVector[myIndex]                      # Use index for selection
                                       #   (same as myVector[1])

# ------------------------------------------------------------------------------

myVector = c("Bob", "Mary", "Fred")    # Set up vector of character values
myVector[2]                            # Show second element
myIndex = c(1, 3)                      # Set up an index variable
                                       #   holding two positions
myVector[myIndex]                      # Select vector elements w/index
                                       #   (returns elements 1 and 3)

# 3.5b Vector operations ================================================== 3.5b

# Vectorized operation
myVector1 = c(1, 2, 3)                 # Create vector with 3 elements
myVector2 = log(myVector1)             # Create new vector with log 
                                       #   of each element (natural log)
myVector2                              # Print new vector

# ------------------------------------------------------------------------------

# non-vectorized operation
myVector = c(1, 2, 3)                  # Numeric vector with 3 elements
if(myVector == 1) print("Answer is 1") # This produces a likely error
                                       #   The condition has 3 values but if()
                                       #   wants exactly one. R 4.2 and later
                                       #   stop with an error; older versions
                                       #   warned and used the first value
if(myVector[1] == 1)                   # This time we specify the first element
  print("Answer is 1")
myAnswer = ifelse(myVector == 1,       # Here is the vectorized ifelse()
  "Answer is 1",                       #   it operates on each element in
  "Answer is not 1")                   #   the vector individually
myAnswer                               # Show results

# ------------------------------------------------------------------------------

# Whole vector operations
myVector = c(1, 2, 3)                  # Create a vector
min(myVector)                          # Get the vector minimum
max(myVector)                          # Get the vector maximum
sum(myVector)                          # Sum the vector elements

# ------------------------------------------------------------------------------

# Vector math
myVector1 = c(1, 2, 3)                 # A vector with 3 elements
myVector2 = c(10, 20, 30)              # Another 3 element vector
myVector1 + myVector2                  # Vector addition
myVector1 * myVector2                  # Vector multiplication
myVector2/myVector1                    # Vector division
myVector5 = c(10, 20, 30, 40, 50)      # A vector with 5 elements
myVector1 + myVector5                  # Add different length vectors (warning)
                                       #   5 is not a multiple of 3: R still
                                       #   recycles the shorter vector but
                                       #   warns about the length mismatch
myVector6 = c(10, 20, 30, 40, 50, 60)  # A vector with 6 elements
myVector1 + myVector6                  # Add different length vectors when one
                                       # length is a multiple of the other
                                       # (the shorter one is recycled silently)

# 3.6 The basic objects: Matrices and their indices ======================== 3.6

# Matrix math is not the same as matrix algebra!
myMatrix = rbind(c(3, 8), c(23, 33))   # Create matrix by binding two rows
myMatrix                               # Display myMatrix
myMatrix1 = myMatrix + 7               # Add 7 to each element of myMatrix
myMatrix1                              # Display myMatrix1

myMatrix2 = rbind(c(2, 5), c(3, 2))    # Create another matrix by binding rows
myMatrix2                              # Display myMatrix2
myMatrix3 = myMatrix1 / myMatrix2      # Divide myMatrix1 by myMatrix2
                                       #   (element by element)
myMatrix3                              # Display myMatrix3
myMatrix4 = myMatrix2 %*% myMatrix3    # Use matrix algebra multiplication
                                       #   (rows times columns, unlike *)
myMatrix2                              # Display myMatrix2
myMatrix3                              # Display myMatrix3
myMatrix4                              # Display myMatrix4

# 3.7 The basic objects: Data frames ======================================= 3.7

# Row and column names

myData = data.frame(cbind(             # Sample data frame so that the
  c(1, 0, 1, 1, 0),                    #   commands below can actually be run
  c(24, 38, 22, 51, 17)))              #   (5 observations, 2 variables)

rownames(myData) =                     # Add row names 
  c("Mary", "Mike", "Mia", "Mish", "Mark")
colnames(myData) = c("sex", "age")     # Add column names 
colnames(myData) = myData[1,]          # Set col names to values in 1st row
                                       # Note that this keeps first row as 
                                       #   first obs in the data frame 
                                       # (R is case sensitive: the object is
                                       #   myData, not mydata)


# 3.7a Referencing data frame elements ==================================== 3.7a

# Attach and other methods =====================================================
# Set up a data frame
myDF = data.frame(                     # Create a data frame
  myVar1 = c(seq(0, 100, by = 5)),     # Set up variable 1 (0, 5, ..., 100)
  myVar2 = c(0:20))                    # Set up variable 2 (0, 1, ..., 20)

# 1. the $ construction --------------------------------------------------------

mean(myDF$myVar1)                      # This always works & minimizes errors

# 2. The attach() method -------------------------------------------------------

attach(myDF)                           # Attach the data frame
  mean(myVar2)                         # Do something with the data frame
  myVar2 = myVar2 * 2                  # Here is a transformation
                                       #   The result is stored as a separate
                                       #   workspace object called myVar2;
                                       #   myDF itself is not modified
  mean(myVar2)                         # Mean of transformed variable
detach(myDF)                           # Detach the data frame
mean(myDF$myVar2)                      # Transform lost outside of attach
                                       #   (myDF$myVar2 still has the original
                                       #   values; the workspace copy myVar2
                                       #   is still around after detach)

# 3. The data= method ----------------------------------------------------------

lm(myVar1 ~ myVar2, data = myDF)       # Do something with the data frame again
                                       #   (variables are looked up in myDF
                                       #   first)

# 4. The with() method ---------------------------------------------------------                  

with(myDF, {                           # Use with() to indicate data frame
  myVar1 = myVar1/2                    # Transform myVar1
                                       #   (a temporary copy inside with())
  sd(myVar1)                           # Std dev. of transformed myVar1
})                                     # Note close brace & close paren finish
sd(myDF$myVar1)                        # Transform lost outside with()

# 3.7b Displaying the contents of a data frame ============================ 3.7b

myVar1 = letters[1:3]                  # Character variable: "a", "b", "c"
myVar2 = c(10, 11, 12)                 # Numeric variable
myDF = data.frame(myVar1,              # Data frame with the two variables
  myVar2)                              #   as its columns
myDF                                   # Display the whole data frame
names(myDF)                            # List the variable names in myDF
summary(myDF)                          # Per-variable summary of myDF

# 3.8 The basic objects: Lists ============================================= 3.8

# A list generated by the lm model ---------------------------------------------

myVar1 = c(1:8)                        # Set up an x (predictor) variable
myVar2 = c(3, 5, 4, 6, 7, 9, 2, 9)     # Set up a y (outcome) variable
myModel = lm(myVar2 ~ myVar1)          # Create a linear model
                                       #   (myVar2 explained by myVar1)
myModel                                # Print the model output
attributes(myModel)                    # Show the elements in the model output
myModel$residuals                      # Show the residuals from the model

# Double brackets v. Single bracket list indexing ------------------------------

myModel[2]                             # Single bracket index result
                                       #   (a list holding the 2nd element)
myModel[[2]]                           # Double bracket index result
                                       #   (the 2nd element itself)

myModel[2][1]                          # Single [] won't open list item
myModel[[2]][1]                        # Double [[]] allows list item access
myModel$residuals[1]                   # $ referencing works the same way

# Create a list ----------------------------------------------------------------

myList = list(1,                       # List starting with model num (1)
  myModel,                             # Then the myModel list 
  "This is a discussion of myModel")   # Then some discussion of myModel

names(myList) = c("ModelNumber",       # Create the list names
  "ModelOutput","ModelDiscussion")     #   (one per element, in order)
myList                                 # Show myList
myList$ModelOutput$residuals           # Residuals from myModel in myList

# unlist() ---------------------------------------------------------------------

myList = list(                         # A list with three elements:
  c(1, 2, 3),                          #   a numeric vector,
  c("a", "b", "c"),                    #   a character vector,
  "It's numbers and letters!")         #   and a single string
typeof(myList)                         # Storage type of the list itself
myList                                 # Display the list
myNotList = unlist(myList)             # Flatten the list into one vector
typeof(myNotList)                      # Storage type after flattening
myNotList                              # Display the flattened vector

# 3.9 A few things about working with objects ============================== 3.9

# Listing active objects -------------------------------------------------------
objects()                              # Show all active objects

# Removing objects -------------------------------------------------------------
myVector = 1:10                        # Create some objects
myNewVector = myVector+3
myAnimal = "aardvark"
objects()                              # List the objects

rm(myVector)                           # Remove an object
objects()                              # List the objects
                                       #   (myVector is gone; myNewVector
                                       #   is unaffected)
rm(list=ls())                          # Remove ALL objects
                                       #   Caution: this clears everything in
                                       #   the workspace, including objects
                                       #   created earlier in this script
objects()                              # List the objects
 
# Object overwriting -----------------------------------------------------------

myNumber = 5                           # Assign value 5 to myNumber
myNumber = 7                           # Assign value 7 to myNumber
                                       #   (no warning is given)
myNumber                               # Show that 7 replaced 5

myNumber = 5                           # Assign value 5 to myNumber
myNumber = myNumber + 4                # Add 4 to myNumber
                                       #   (right side is evaluated first)
myNumber                               # Show new value for myNumber

# 3.10 Object Attributes ================================================== 3.10

# Set attributes for row and column names in a data frame 
myData = data.frame(cbind(             # Create data frame w/2 vectors
  c(1, 0, 1, 1, 0),                    #   Vector 1
  c(24, 38, 22, 51, 17)))              #   Vector 2

# The attr approach
attr(myData, "names") =                # Set col names to identify vars
  c("sex", "age") 
attr(myData, "row.names") =            # Set row names to identify obs
  c("Mary", "Mike", "Mia", "Mish", "Mark")
myData                                 # Display the data frame

attr(myData, "names")=NULL             # Erase the column names attribute
                                       #   (row names are left in place)

# The rownames/colnames approach
rownames(myData) =                     # Add row names 
  c("Mary", "Mike", "Mia", "Mish", "Mark")
colnames(myData) = c("sex", "age")     # Add column names 

# Using str() to show the structure of a data frame ----------------------------
str(myData)                            # Show structure of myData data frame

# 3.11 Objects and environments =========================================== 3.11

# 3.12 Object classes ===================================================== 3.12

x = 5                                  # Create an object
class(x)                               # Show object class
class(mean)                            # Show class of existing R command
class(x) = "my made up class"          # Set new custom class for x
                                       #   (any character string is accepted)
attributes(x)                          # Show attributes of x
                                       #   (the class is stored as one)
str(x)                                 # Show structure of x

# 3.13 The pseudo storage modes =========================================== 3.13

# 3.14 Date and time as storage modes ===================================== 3.14

# 3.15 Factors ============================================================ 3.15

# Figure 3.2 - A graphic showing factor vectors --------------------------------

# This is another exercise in drawing with R. 
# This starts with the custom ovals function from chapter 10 and the
# custom arrows function from chapter 14 already loaded.

# Draws the factor illustration from text and line segments on an empty plot.
# Left of the dividing line: the vector as the user sees it. Right of it:
# the levels, pointers and labels vectors. Each vector is wrapped in a pair
# of hand-drawn square brackets made with lines().
png(filename = "illustrations/fig-3-2-R factors.png",
  units = "in",                        # Set measurements in inches
  res = 1200,                          # Set resolution at 1200dpi
  width = 6,                           # Width at 6 inches
  height = 4)                          # Height at 4 inches

par(mai = c(0, 0, 0, 0))               # Set margins - no Title
par(usr = c(0, 1, 0, 1))               # Set coordinates to 0,1 space
                   
plot.new()                             # Start a new plot

text(x = .15, y = .97,                 # The vector you see text
  labels = "The vector\nyou see:",
  pos = 1,                             # Place text below and centered
  cex = 1.5,                           # Set font size
  font = 2,                            # Bold
  family = "serif")                    # Serif font

text(x = .15, y = .65,                 # What you see vector
    labels='
"Small"\n"Small"\n"Large"\n"Large"\n"Large"\n"Small"\n"Small"',
  pos = 1,                             # Place text below and centered
  cex = 1,                             # Font size
  family = "sans")                     # Sans serif font

lines(
  x = c(.09, .075, .075, .09),         # Add brackets for what you see vector
  y = c(.65, .65, .2, .2),             #   (this one is the left bracket)
  col = "black",
  lwd = 2)

lines(                                 # Right bracket
  x = c(.21, .225, .225, .21),
  y = c(.65, .65, .2, .2),
  col = "black",
  lwd = 2)

segments(x0 = .3, x1 = .3,             # Dividing Line
  y0 = .05, y1 = .95,
  lwd = 4,
  col = "gray")

text(x = .6, y = .97,                  # Vectors R Sees text
  labels = "The vectors R sees:",
  pos = 1,                             # Place text below and centered
  cex = 1.5,                           # Font size
  font = 2,                            # Bold
  family = "serif")                    # Serif font

text(x = .45, y = .89,                 # Levels Vector text
  labels = "The\nLevels\nVector",
  pos = 1,                             # Place text below and centered
  cex = 1,                             # Font size
  font = 2,                            # Bold
  family = "serif")                    # Serif font

text(x = .45, y = .625,                # Levels Vector
  labels = 'L1: "L"\nL2: "S"',
  pos = 1,                             # Place text below and centered
  cex = 1,                             # Font size
  family = "sans")                     # Sans serif font

lines(                                 # Add brackets for levels vector
  x = c(.39, .375, .375, .39),         #   (left bracket)
  y = c(.65, .65, .475, .475),
  col = "black",
  lwd = 2)

lines(                                 # Right bracket
  x = c(.51, .525, .525, .51),
  y = c(.65, .65, .475, .475),
  col = "black",
  lwd = 2)
                                       # Pointers vector text
text(x = .6, y = .89,
  labels = "The\nPointers\nVector",
  pos = 1,                             # Place text below and centered
  cex = 1,                             # Font size
  font = 2,                            # Bold
  family = "serif")                    # Serif font

text(x = .6, y = .62,                  # Pointers vector
  labels = "2\n2\n1\n1\n1\n2\n2",      #   (level number of each observation)
  pos = 1,                             # Place text below and centered
  cex = 1.05,                          # Font size
  family = "sans")                     # Sans serif font

lines(                                 # Add brackets for pointers vector
  x = c(.565, .55, .55, .565),         #   (left bracket)
  y = c(.65, .65, .2, .2),
  col = "black",
  lwd = 2)

lines(                                 # Right bracket
  x = c(.635, .65, .65, .635),
  y = c(.65, .65, .2, .2),
  col = "black",
  lwd = 2)
  
text(x = .75, y = .89,                 # Labels vector text
  labels = "The\nLabels\nVector\n(optional)",
  pos = 1,                             # Place text below and centered
  cex = 1,                             # Font size
  font=2,                              # Bold
  family = "serif")                    # Serif font

text(x = .75, y = .625,                # Labels vector
  labels = '"Large"\n"Small"',
  pos = 1,                             # Place text below and centered
  cex = 1,                             # Font size
  family = "sans")                     # Sans serif font

lines(                                 # Add brackets for labels vector
  x = c(.69, .675, .675, .69),         #   (left bracket)
  y = c(.65, .65, .475, .475),
  col = "black",
  lwd = 2)

lines(                                 # Right bracket
  x = c(.81, .825, .825, .81),
  y = c(.65, .65, .475, .475),
  col = "black",
  lwd = 2)

dev.off()                              # Output png file

# Factor levels and labels -----------------------------------------------------

# Lookup tables: the same five countries as codes and as full names,
# listed in matching order.
c2ltr = c("FR", "UK", "SW", "NK", "SO")
cname = c("France", "U.K.", "Sweden", "North Korea", "Somalia")

# A small dataset: country codes plus democratic status
country = c("UK", "SW", "FR", "SO", "NK")
regime = rep(c("dem", "nondem"),       # Three democracies followed by
  times = c(3, 2))                     #   two non-democracies
nations.df = data.frame(regime,        # Put both variables in a data frame
  country)
nations.df                             # Display it with the raw codes

# Turn the codes into a labelled factor
nations.df$country = factor(
  nations.df$country,                  # Values to convert
  labels = cname,                      # Full names to display
  levels = c2ltr)                      # Codes, in the order of the names

nations.df                             # Display again, now with names

# Controlling factors with the factor() command --------------------------------
# Set up the factor from figure 3.2
myFactor = factor(c("S", "S", "L", "L", "L", "S", "S"))
str(myFactor)                          # Show factor structure

# Change the ordering of the levels
myFactor = factor(myFactor,            # Use values from current factor
  levels=c("S", "L"))                  # Specify the levels
str(myFactor)                          # Show factor structure

# Add factor labels
myFactor = factor(myFactor,            # Use values from current factor
  labels=c("Small", "Large"))          # Specify labels 
str(myFactor)                          # Show factor structure

# Add additional level ---------------------------------------------------------
myFactor = factor(myFactor,            # Use values from current factor
  levels=c("Small", "Medium",          # Specify the levels w/addition
    "Large"))  
str(myFactor)                          # Show factor structure

levels(myFactor) =                     # Use levels() to add new level
  c(levels(myFactor), "X-Large")       # Combine old levels with new
summary(myFactor)                      # Summarize myFactor w/new levels

# Add new observations
# c() on a factor and a string combines the factor's integer codes, not its
# labels, so the factor is converted to character before the new value is
# appended; otherwise every existing observation would turn into NA.
myFactor = factor(
  c(as.character(myFactor), "Medium"), # Current labels plus the new value
  levels = c("Small", "Medium", "Large")) # Specify the levels w/addition
str(myFactor)                          # Show factor structure

# Adding observations ----------------------------------------------------------
# Plain concatenation does not keep the result a factor
myFactor2 <- c(myFactor, "Medium")     # Try appending a label
myFactor2                              # Show result

myFactor2 <- c(myFactor, 2)            # Try appending a pointer value
myFactor2                              # Show result
is.factor(myFactor2)                   # Check whether it is still a factor

# The data frame approach to adding observations -------------------------------
myDF <- data.frame(myFactor)           # Wrap the factor in a data frame
myDF <- rbind(myDF, "Medium", "Small") # Append 2 new rows
myDF                                   # Show result
myFactor2 <- myDF[["myFactor"]]        # Pull the column back out as a vector
myFactor2                              # Confirm result

# Combining levels -------------------------------------------------------------

myFactor2                              # Display the factor
levels(myFactor2)                      # Show the current levels
levels(myFactor2) <- c("Small", "Large", "Large")  # Merge Medium into Large
myFactor2                              # Show the new version

# Ordered and unordered factors ------------------------------------------------

# All sizes, listed from smallest to largest
mySize <- c("small", "medium", "large", "X-large")
sort(mySize)                           # Character sort is alphabetical

# Ordered factor whose level order follows the vector above
mySize <- ordered(mySize, levels = mySize)

sort(mySize)                           # Sort now follows the level order

# Some data that uses the same size names
myData <- c("small", "large", "small", "X-large", "medium")
sort(myData)                           # Alphabetical again (plain characters)
myData <- factor(myData, levels = mySize)  # Borrow the levels from mySize
sort(myData)                           # Sort now follows the factor levels

# 3.16 Coercing storage modes ============================================= 3.16

# The statements below that fail are intentional: they demonstrate what
# happens when arithmetic is attempted on character data. Run them line by
# line; sourcing the whole script stops at the first error.
myVector = c(1, 15, 7, "Smith")        # Mixing numbers with a string makes
                                       #   c() store every element as character
typeof(myVector)                       # Show type for vector
myVector[2] + 1                        # Intentional error: "15" is a string,
                                       #   so math on it fails
typeof(myVector[2])                    # Type for 2nd element in vector
myVector = as.numeric(myVector)        # Vector forced to numeric; "Smith"
                                       #   cannot be converted and becomes NA
myVector                               # Print vector
typeof(myVector)                       # Type for forced numeric vector

# 3.17 The curse of number-character-factor confusion ===================== 3.17

myData = c(7, 8, "missing", 8)         # Here we simulate the csv input; the
                                       #   "missing" entry makes it character
sum(myData)                            # Intentional error: sum() of characters
myData[1]                              # We see that the value is a character
myData[1] + 2                          # Intentional error: character + number
as.numeric(myData[1])                  # Transforming to numeric fixes this
as.numeric(myData[1]) + 2              # Now we can do a numeric operation

# Character variable factors to numbers ----------------------------------------
                                       
animal = c(rep("kangas", 4),           # Create a character variable
  rep("koalas", 5))
myData = data.frame(animal,            # Putting the data in a data frame
  stringsAsFactors = TRUE)             #  and converting character to factor
                                       #  (explicit: not the default in R 4+)
levels(myData$animal)                  # Show the factor levels
aninum = as.numeric(myData$animal)     # Numeric version = level codes
myData = cbind(myData,aninum)          # Add that to the data frame
myData                                 # Show results

# The number factor confusion illustrated --------------------------------------
                                       
myVar = c(rep(7, 4),                   # Create variable w/ "missing"
  "missing", rep(8, 5))                #  value which forces to character
myData = data.frame(myVar,             #  which --> factor in data frame
  stringsAsFactors = TRUE)             #  (explicit: not the default in R 4+)
levels(myData$myVar)                   # Show levels of unwanted factor 
myVar2 = as.numeric(myData$myVar)      # Convert to numeric: this yields the
                                       #  level codes, not the values 7 and 8
myData = cbind(myData,myVar2)          # Add to data frame
myData                                 # Show data

# Factors to characters --------------------------------------------------------
animal = c(rep("kangas", 4),           # Create some data
  rep("koalas", 5))
myData = data.frame(animal,            # Putting the data in a data frame
  stringsAsFactors = TRUE)             #   as a factor (explicit: not the
                                       #   default in R 4+)
myData$animal[4]                       # Let's take a look at obs 4
myData$animal[4] = "koalas"            # Now we'll change it to "koalas"
myData$animal[4]                       # Another look at num 4 

# Errors from trying to append new factor levels -------------------------------
myData$animal[4] = "hippopotami"       # "hippopotami" is not an existing level:
                                       #   R warns ("invalid factor level") and
                                       #   stores NA in obs 4 instead

# Solution 1 for the character/factor conundrum -- using the I() function ------
animal = c(rep("kangas", 4), rep("koalas", 5))
myData = data.frame(I(animal))         # Putting the data in a data frame
                                       #   but force animal to stay character
typeof(myData$animal)                  # Just checking on the datatype
                                       #   (was my.data, an undefined object)
myData$animal[4]                       # Take a look at obs 4
myData$animal[4] = "hippopotami"       # Now we can make the change
myData$animal[4]                       # Voila!

# Solution 2 for the character/factor conundrum -- adding another factor level -
animal = c(rep("kangas", 4), rep("koalas", 5))
myData = data.frame(animal,            # Putting the data in a data frame
  stringsAsFactors = TRUE)             #   as a factor (explicit: not the
                                       #   default in R 4+; without it there
                                       #   are no levels to extend below)
typeof(myData$animal)                  # Checking on our datatype - integer?!
myData$animal = factor(myData$animal,  # We'll add a new level to the mix
  levels=                              #   with the levels option
    c(levels(myData$animal),           # Combining old levels
     "hippopotami"))                   #   with our new entry
myData$animal[4]                       # Let's look at animal[4]
myData$animal[4] = "hippopotami"       # Now we can add "hippopotami" because
myData$animal[4]                       #   that level is included in the factor

# Solution 3 for the character/factor conundrum -- manual conversions ----------
animal = c(rep("kangas", 4), rep("koalas", 5))
myData = data.frame(animal,            # Putting the data in a data frame
  stringsAsFactors = TRUE)             #   as a factor (explicit: not the
                                       #   default in R 4+)
typeof(myData$animal)                  # Check on the data type
myData$animal =                        # Force the variable back to character
  as.character(myData$animal)
typeof(myData$animal)                  # Recheck the type -- that works!
myData$animal[4]                       # Obs 4 before the change
myData$animal[4] = "hippopotami"       # Now we can make the change to obs 4
myData$animal[4]                       # Obs 4 after the change

R Code for Chapter 4: Getting your Data into R

#===============================================================================
# Chapter 4-- Getting your data into R
#===============================================================================
# 4.1 Entering data ======================================================== 4.1

# 4.1a Entering data with the concatenate function ======================== 4.1a

# Numeric data typed in directly with c()
myVar <- c(1, 7, 3, 5, 9, 21, 8)
myVar                                  # Print data

# c() works the same way for character strings
myWords <- c("ant", "ball", "clown")
myWords                                # Print variable

# 4.1b Joining vectors into matrices and data frames ====================== 4.1b

# cbind: each vector becomes a column
temp <- c(78.4, 65.3, 72.9, 81.2)      # Daily temperature
rain <- c(.025, .001, 0, 1.2)          # Daily rainfall
day <- c(1, 2, 3, 4)                   # Day number
weather1 <- cbind(day, temp, rain)     # Bind the three vectors as columns
weather1                               # Show results

# rbind: each vector becomes a row
day1 <- c(.025, 78.4)                  # One (rain, temp) pair per day
day2 <- c(.001, 65.3)
day3 <- c(0, 72.9)
day4 <- c(1.2, 81.2)
weather2 <- rbind(day1, day2, day3, day4)  # Bind the four vectors as rows
weather2                               # Show matrix
colnames(weather2) <- c("rain", "temp")    # Name the columns
weather2                               # Show weather2 matrix w/names

# concatenating different length vectors
temp <- c(78.4, 65.3, 72.9, 81.2)      # Four values
rain <- c(.025, .001, 0, 1.2)          # Four values
day <- c("Saturday", "Sunday")         # Only two values: cbind recycles them
weekend.weather <- data.frame(cbind(day, temp, rain))
weekend.weather                        # Show results


# 4.1c Entering data with the R spreadsheet =============================== 4.1c

# Open a spreadsheet and edit existing data
myX <- c(1, 2, 3)                      # First variable (numeric)
myY <- c("a", "b", "c")                # Second variable (character)
myData <- cbind(myX, myY)              # cbind joins them into a matrix
data.entry(myData)                     # Open in data entry spreadsheet

# Open a new data list that starts from a simple dummy value
x <- 1
data.entry(x)

# Use the edit method on a data frame
myX <- c(1, 2, 3)                      # First variable (numeric)
myY <- c("a", "b", "c")                # Second variable (character)
myData.df <- data.frame(myX, myY)      # Combine in data frame
myData.df <- edit(myData.df)           # edit() opens the spreadsheet and
                                       #   returns the edited copy


# 4.2 Creating data ======================================================== 4.2

# 4.2a Simple sequence and repetitions ==================================== 4.2a

# Colon sequences
myVar1 <- 0:10                         # Integers from 0 to 10
myVar1                                 # Show result

myVar2 <- 10:22                        # Integers from 10 to 22
myVar2                                 # Show result

myVar3 <- -5:5                         # Integers from -5 to 5
myVar3                                 # Show result

# The sequence function
myVar4 <- seq(from = 0, to = 5, by = .5)    # 0 to 5 in steps of .5
myVar4                                 # Show result

myVar5 <- seq(from = 7, to = 5, by = -.25)  # 7 down to 5 in steps of .25
myVar5                                 # Show result

# The repeat function
myVar1 <- rep(7, times = 3)            # Three 7's
myVar1                                 # Show result

myVar2 <- rep(c(1, 2), times = c(5, 3))     # Five 1's followed by three 2's
myVar2                                 # Show result

myVar3 <- rep(c("Fred", "Wilma"),      # Two "Fred" followed by
  times = c(2, 4))                     #   four "Wilma"

myVar3                                 # Show result

# 4.2b Generating factors ================================================= 4.2b

# Three levels, each repeated twice (total length defaults to 3 * 2 = 6)
myFactor <- gl(3, 2, labels = c("small", "med", "large"))

myFactor                               # Display factor

# Two levels in runs of 4, recycled out to a total length of 10
myFactor2 <- gl(n = 2, k = 4, length = 10, labels = c("y", "n"))
myFactor2                              # Display factor

# 4.2c Random numbers and statistical distributions ======================= 4.2c

# Values from a uniform distribution
myRand <- runif(10)                    # 10 random numbers between 0 and 1
myRand                                 # Print them at full precision
round(myRand, digits = 2)              # Print them rounded to 2 places

myRand2 <- runif(5, min = 0, max = 100)  # 5 random values between 0 and 100
round(myRand2, digits = 2)             # Print them

myRand3 <- runif(3, 0, 10)             # 3 random values between 0 and 10
round(myRand3, digits = 2)             # Print them

# Values from a normal distribution:

myNorm <- rnorm(10)                    # 10 values, mean = 0 & std dev = 1
round(myNorm, digits = 2)              # Print them

myNorm2 <- rnorm(10, mean = 50, sd = 10)  # 10 values, mean = 50 & std dev = 10
round(myNorm2, digits = 2)             # Print them

myNorm2 <- rnorm(10000)                # 10,000 values from a std normal dist

# Open a 6 x 4 inch png device at 1200 dpi for the figure
png(filename = "illustrations/fig-4-1-normal dist histogram.png",
  width = 6, height = 4, units = "in", res = 1200)

par(mai = c(1, 1, .25, .25))           # Margins in inches; no room for a title

hist(myNorm2, main = NA)               # Histogram with the title turned off

dev.off()                              # Close the device to write the file

# 4.3 Importing data ======================================================= 4.3

# 4.3a The working directory ============================================== 4.3a

# The path below is a Windows-style example; substitute your own data folder.
setwd("C:/data")                       # Set the working directory
getwd()                                # Show the working directory

# Pick a file in a dialog and make its folder the working directory.
# NOTE(review): choose.files() is, as far as I know, only available on
#   Windows builds of R -- confirm before running on other platforms.
setwd(dirname(choose.files()))

# 4.4 The read command: Overview =========================================== 4.4

# 4.5 Reading from the clipboard =========================================== 4.5

myData <- read.delim(file = "clipboard", header = TRUE)

# 4.6 Reading blank delimited data ========================================= 4.6

myData <- read.table(file = "myDataFile.txt", header = FALSE)

# 4.7 Reading comma separated values ======================================= 4.7

myData <- read.csv(file = "myDataFile.csv", header = TRUE)

# 4.8 Reading tab separated values ========================================= 4.8

myData <- read.delim(file = "myDataFile.txt", header = TRUE)

# 4.9 Fixed width data ===================================================== 4.9

# The argument is spelled out as `widths` (its real name) rather than relying
# on partial matching of `width`, and the file is given as a literal name like
# the other examples because no `filename` object is defined in this script.
myData = read.fwf("myDataFile.txt", widths = c(5, 2, 3, 2, 15, 3, 2))

# Reading multiline data
# We set up two vectors with the relevant width instructions for each line
line1 = c(3, -2, 5, 2, 3, 2, 15)       # We'll skip the line numbers with the -2
line2 = c(-3, -2, 3, 2)                # Skip obs & line numbers on 2nd line
myData = read.fwf("myDataFile.txt", widths = list(line1, line2))

# 4.10 Generic tabular data =============================================== 4.10

# Three different kinds of value separators:
myData = read.table("myDataFile.txt", header = TRUE, sep =  ",")   # comma
# The tab character is written "\t" (backslash); "/t" is two ordinary
# characters, which read.table() rejects because sep must be a single byte.
myData = read.table("myDataFile.txt", header = TRUE, sep =  "\t")  # tab
myData = read.table("myDataFile.txt", header = TRUE, sep =  "%")   # percent

# 4.11 Importing foreign filetypes ======================================== 4.11

library(foreign)                       # Readers for other stats packages
myData <- read.dta(file = "c:/data/mydata.dta")  # Import a Stata .dta file

# 4.11a Exporting data in foreign formats ================================ 4.11a

# 4.12 Integrating SQL with R ============================================= 4.12

# 4.13 Extracting data from complex datasources =========================== 4.13

# 4.14 Web scraping ======================================================= 4.14

# 4.15 Dealing with multi-dimensional tables ============================== 4.15

# 4.16 Importing problematic characters =================================== 4.16

# Strip white example - drops extra leading or trailing spaces from fields
myData <- read.delim(file = "myDataFile.txt",
  header = TRUE,                       # First line holds the variable names
  sep = "\t",                          # Fields are separated by tabs
  strip.white = TRUE)                  # Trim surrounding spaces


# 4.17 More resources ===================================================== 4.17

R Code for Chapter 5: Reviewing and Summarizing Data

#===============================================================================
# Chapter 5 -- Reviewing and summarizing data
#===============================================================================

# Set up some synthetic data to use in chapters five and six
# NOTE: The later examples expect the animal variable to be a factor.
# stringsAsFactors = TRUE is passed explicitly because character-to-factor
# conversion is only the data.frame() default in R versions before 4.0.0.

ch5data = data.frame(                  # Create a data frame
  year = c(1935:1959,                  # First variable is years
    1940:1949,                         # Sequence with colon 
    1935, 1942, 1958, 1964, 1970),     # Some more years with commas
  animal = c(rep("Mouse", 25),         # Next variable is animal type
    rep("Kangaroo", 10),               # 25 mice, 10 Kangaroos
    rep("Elephant", 5)),               #    5 Elephants
  weight = round(c(                    # Next is plausible weights
    rnorm(25, mean = .06, sd = .006),  #   normally distributed 
    rnorm(10, mean = 100, sd = 14),    #   around a mean 
    rnorm(5, mean = 12000,             #   and std. dev
      sd = 700)), 3),                  #   and rounded to 3 digits
  captive = rep(                       # Finally a captivity status from
    c(TRUE, FALSE, FALSE, FALSE), 10), #   a repeated TRUE/FALSE pattern
  stringsAsFactors = TRUE)             # Store animal as a factor

# Some overview ----------------------------------------------------------------
attributes(ch5data)                    # Show data frame attributes
str(ch5data)                           # Show data frame structure

# data frame dimensions
names(ch5data)                         # Show variable names in the data frame
dim(ch5data)                           # Show the dimensions of the data frame
nrow(ch5data)                          # Show the number of rows
dim(ch5data)[1]                        # Show the number of rows
ncol(ch5data)                          # Show the number of columns
dim(ch5data)[2]                        # Show the number of columns

# 5.1 Summarizing Data ===================================================== 5.1

summary(ch5data)                       # Summary statistics for ch5data

# Individual summary statistics
# The column is named animal; ch5data$animals does not exist and returns NULL,
# which made length() report 0 instead of the number of observations.
length(ch5data$animal)                 # Show num of obs in animal variable
length(ch5data)                        # Show num of variables in ch5data
mean(ch5data$year)                     # Show mean of year
median(ch5data$weight)                 # Show median of weight
sum(ch5data$weight)                    # Show sum of variable weight
max(ch5data$year)                      # Show maximum value of year
min(ch5data$year)                      # Show minimum value of year
range(ch5data$year)                    # Show range of variable year
sd(ch5data$year)                       # Show standard deviation of year
unique(ch5data$year)                   # Show unique values in year

# missing values ---------------------------------------------------------------

myVar <- c(1, 2, 3, 4, NA, 5)          # One observation is missing
mean(myVar)                            # NA: the missing value propagates
mean(myVar, na.rm = TRUE)              # Drop the NA before averaging

# 5.2 Sampling  ============================================================ 5.2

# Displaying the top of the dataset
head(ch5data, 7)                       # Display first 7 observations

# Sampling for a simple vector
myVector = c(1:1000)                   # Create a vector
sample(myVector, 5)                    # A random sample of 5 observations

# Now for a data frame
# seq_len(nrow()) is used for the row numbers instead of 1:nrow(), which
# would count backwards (1, 0) if the data frame ever had no rows.

sample5 =                              # Create a vector of 5 sampled
  sample(seq_len(nrow(ch5data)), 5)    #   row numbers
   
ch5data[sample5,]                      # Print sample of data
ch5data[sample(seq_len(nrow(ch5data)), 5),]  # All in one line
                                       
# 5.3 Reviewing Data by Categories     ===================================== 5.3
                                      
table(ch5data$animal, ch5data$captive) # Rows = animal, columns = captive
table(ch5data$captive, ch5data$animal) # Rows = captive, columns = animal

# The crosstabs approach (see appendix I) --------------------------------------
# The one-sided formula names the dimensions of the table
xtabs(formula = ~ animal + captive, data = ch5data)

# Keep the crosstab so it can be analyzed further
my.xtab <- xtabs(formula = ~ animal + captive, data = ch5data)

summary(my.xtab)                       # Chi-Square test for the crosstab

# Using the split() function ---------------------------------------------------

# Summaries of the whole data frame within each group
lapply(split(ch5data, ch5data$captive), summary)
lapply(split(ch5data, ch5data$animal), summary)
# Split on two factors at once: the grouping variables must be wrapped in
# list(); as.list(a, b) only converts its first argument and ignores the rest.
lapply(split(ch5data, list(ch5data$captive, ch5data$animal)), summary)
lapply(split(ch5data, ch5data$captive), summary)

# Statistics of one variable within each group
lapply(split(ch5data$weight, ch5data$animal), mean)
# median() needs a numeric vector, so the weight column is split rather than
# the whole data frame (which makes median() stop with an error).
lapply(split(ch5data$weight, ch5data$animal), median, na.rm = TRUE)

# The cut() function -----------------------------------------------------------
# Bin weight into three labelled ranges, split the data frame on those
# bins, and summarize each piece
lapply(
  split(
    ch5data,
    cut(ch5data$weight,
      breaks = c(0, 50, 500, 100000),        # Bin edges, both ends included
      labels = c("small", "large", "huge"))  # One label per bin
  ),
  FUN = summary
)


# 5.4 Histograms =========================================================== 5.4

myVar <- seq(from = -3, to = 3, by = .0001)  # Grid from -3 to 3, step .0001
myNormVar <- rnorm(length(myVar))      # One std normal draw per grid point

# Histogram: 6 x 4 inch png at 1200 dpi
png(filename = "illustrations/fig-5-1-histogram.png",
  width = 6, height = 4, units = "in", res = 1200)

par(mai = c(1, 1, .25, .25))           # Margins in inches; no room for a title

hist(myNormVar, main = NA)             # Histogram with the title turned off

dev.off()                              # Close the device to write the file

# Kernel Density Plot: 6 x 4 inch png at 1200 dpi

png(filename = "illustrations/fig-5-2-kernel density.png",
  width = 6, height = 4, units = "in", res = 1200)

par(mai = c(1, 1, .25, .25))           # Margins in inches; no room for a title

plot(density(myNormVar), main = NA)    # Draw the density curve, no title
polygon(density(myNormVar), col = "gray")  # Optional gray fill under curve

dev.off()                              # Close the device to write the file

# 5.5 Scatterplots                     ===================================== 5.5

myX <- c(1, 5, 19, 7, 6, 18, 11, 10)   # Horizontal coordinates
myY <- c(4, 3, 12, 7, 8, 9, 15, 9)     # Vertical coordinates

# Basic scatterplot: 6 x 4 inch png at 1200 dpi
png(filename = "illustrations/fig-5-3-basic scatterplot.png",
  width = 6, height = 4, units = "in", res = 1200)

par(mai = c(1, 1, .25, .25))           # Margins in inches; no room for a title

plot(myX, myY)                         # Plot y against x

dev.off()                              # Close the device to write the file

# adding labels

# One label per observation, in the same order as myX and myY
myLabels <- c("Bach", "Beethoven", "Brahms", "Mozart",
  "Chopin", "Tchaikovsky", "Satie", "Bartok")

png(filename = "illustrations/fig-5-4-scatterplot labels.png",
  width = 6, height = 4, units = "in", res = 1200)

par(mai = c(1, 1, .25, .25))           # Margins in inches; no room for a title

plot(myX, myY)                         # Plot y against x
text(myX, myY, labels = myLabels,      # Write each label at its point,
  pos = 3,                             #   positioned above the point,
  xpd = TRUE)                          #   and let it spill past the plot area

dev.off()                              # Close the device to write the file

# Only add labels to more extreme points (not in book)

myData = data.frame(myX, myY, myLabels)# Package data in data frame
plot(myData$myX, myData$myY)           # Scatterplot of myX & myY

# X is standardized with its own mean, mean(myX); the earlier version
# subtracted mean(myY) from X, which mixed the two variables' scales.
myData2 = myData[                      # Select cases where 
  abs(myData$myY - mean(myData$myY)) > #   deviation of Y from mean >
  sd(myData$myY) &                     #   1 std dev and where
  abs((myData$myY - mean(myData$myY))/ #   standardized deviation of Y
  sd(myData$myY)) > 2 *                #   is greater than 2 times the    
  abs((myData$myX - mean(myData$myX))/ #   standardized deviation of X
  sd(myData$myX)),]

text(myData2$myX, myData2$myY,         # Use coordinates of extreme points
  myData2$myLabels,                    #   to place labels
  pos = 4)                             #   to the right of the points

# 5.6 Pairs Plots ========================================================== 5.6

myZ <- c(3, 9, 12, 2, 2, 17, 1, 8)     # Third variable to go with myX & myY
myData <- data.frame(myX, myY, myZ)    # All three variables in one data frame

# Pairs plot: 6 x 4 inch png at 1200 dpi
png(filename = "illustrations/fig-5-5-pairs plot.png",
  width = 6, height = 4, units = "in", res = 1200)

par(mai = c(1, 1, .25, .25))           # Margins in inches; no room for a title

pairs(myData)                          # Scatterplot for every variable pair

dev.off()                              # Close the device to write the file

R Code for Chapter 6: Sorting and Selecting Data

#===============================================================================
# Chapter 6 -- Sorting and selecting data
#===============================================================================

# We'll use the same data we set up at the beginning of chapter 5 throughout
# this chapter

# 6.1 Using index values for selection ===================================== 6.1

selection <- c(1, 5, 7, 11:14)         # Row numbers to keep
myDF <- ch5data[selection, ]           # Select rows via the stored vector
myDF <- ch5data[c(1, 5, 7, 11:14), ]   # Same selection written inline

myDF                                   # Print myDF
myDF[-2, ]                             # Negative index leaves out row 2

myDF2 <- myDF[, -c(1, 3)]              # Leave out columns 1 & 3

myDF4 <- myDF[-c(1, 3, 7), ]           # Leave out rows 1, 3 & 7

# 6.2 Selecting with conditional values===================================== 6.2

myDF <- ch5data                        # Shorter name for convenience
kDF <- myDF[myDF$animal == "Kangaroo", ]  # Rows where animal is Kangaroo
head(kDF, n = 3)                       # First 3 kangaroo observations

# Big kangaroos: both conditions must hold
bigkDF <- myDF[myDF$animal == "Kangaroo" & myDF$weight > 100, ]

# The which() approach ---------------------------------------------------------
# which() returns the row numbers where the condition is TRUE
kselect <- which(myDF$weight > 100 & myDF$animal == "Kangaroo")
kselect                                # Row numbers of kangaroos over 100
bigkDF2 <- myDF[kselect, ]             # Use the row numbers to select
bigkDF2                                # Print data frame

# 6.3 Using subset() ======================================================= 6.3

myData = data.frame(
  myV1 = c(10, 20, 30, 40),            # Set up 3 variables in a data frame
  myV2 = c("a", "b", "c", "d"),
  myV3 = c(1, 2, 3, 4))
myData                                 # Display the data frame

# Subset by rows
myData1 = subset(myData, myV1>20)      # Select rows with myV1>20
myData1                                # Display results
# Unlike subset(), brackets do not look inside the data frame, so the column
# must be written as myData$myV1 (a bare myV1 is an undefined object here).
myData2 = myData[myData$myV1>20,]      # Same selection with bracket method
myData2                                # Display results
                                                                  
# Subset by columns                     
myData3 = subset(myData,               # Create a data subset
  select = myV1)                       #    with just myV1
myData3                                # Display the result
myData4 = subset(myData,               # Create a data subset
  select = -myV3)                      #   without myV3
myData4                                # Display the result

myData3b = myData[,1]                  # Same column w/bracket approach
myData3b                               #   (returned as a plain vector)
myData4b = myData[,-3]                 # Same result w/bracket approach
myData4b                               # Display result
myData3c = myData[,"myV1"]             # Using variable name to select col
myData3c                               # Display result
myData4c = myData[,c("myV1", "myV2")]  # Use variable name for col selection
myData4c                               # Display result


# Subsetting with a vector
myData5 = subset(                      # Create a subset with just
  myData, select = c(myV1, myV3))      #   myV1 and myV3 
myData5                                # Display the result

myData6 = subset(                      # Create a subset without
  myData, select = -c(myV2, myV3))     #   myV2 and myV3
myData6                                # Display the result

mySelector = c("myV1", "myV3")         # Create vector to select variables
myData7 = subset(                      # Subset the data using
  myData, select = mySelector)         #   the mySelector vector
myData7                                # Display the result

# 6.4 Splitting data into groups with by() ================================= 6.4

myDF <- ch5data                        # Start again from the full dataset

summary(myDF)                          # Summary of the whole ch 5 dataset
by(data = myDF, INDICES = myDF$animal, FUN = summary)  # One summary per animal

# One summary per combination of animal type and captivity status
by(data = myDF,
  INDICES = interaction(myDF$animal, myDF$captive),
  FUN = summary)


# 6.5 Splitting up continuous numeric data ================================= 6.5

# Splitting data with cut() ----------------------------------------------------
# Seven break points (-3, -2, ..., 3) define six intervals, each one unit wide.
# A value that lies outside the breaks lands in no interval and becomes NA.
myVar = rnorm(n = 100)                 # 100 draws from the std. normal dist
myVar2 = cut(myVar, breaks = -3:3)     # Factor naming each obs's interval
table(myVar2)                          # Count of obs per interval

# Suppress labels --------------------------------------------------------------
myVar2 = cut(myVar, breaks = -3:3,     # Same intervals, but returned as
  labels = FALSE)                      #   integer codes instead of a factor
table(myVar2)                          # Count of obs per interval code

# Using quantile() -------------------------------------------------------------
myVar = rnorm(100)                     # Fresh sample of 100 normal draws
quantile(x = myVar)                    # No probs given, so quartiles
quantile(x = myVar,                    # Quintiles, cut points typed out
  probs = c(0, .2, .4, .6, .8, 1))
quantile(x = myVar,                    # Quintiles, cut points from seq()
  probs = seq(from = 0, to = 1, by = .2))
quantile(x = myVar,                    # Custom cut points suited to the
  probs = c(.0001, .01, .025, .5,      #   tails of a normal curve
    .975, .99, .9999))

# cut and quantile together ----------------------------------------------------
# Passing quantile() output as the breaks gives bins that each hold the same
# number of observations. The lowest break equals min(myVar), and cut()
# intervals are open on the left, so include.lowest = TRUE is needed to keep
# the minimum from turning into NA.
myVar = rnorm(n = 100)                 # Set up data drawn from normal dist
myVarQ = cut(myVar,                    # Divide myVar into bins
  breaks = quantile(myVar),            # Default quartile breaks give 4 bins
  include.lowest = TRUE)               # Include lowest value in 1st bin

summary(myVarQ)                        # Show number of obs in each quartile
head(cbind(myVar, myVarQ))             # First 6 obs; bin shown as its code

myVarQ2 = cut(myVar,                   # Divide myVar into bins
  breaks = quantile(myVar,             # Quintile breaks give 5 bins
    probs = seq(0, 1, by = .2)),       # Argument name spelled out in full
  include.lowest = TRUE)               # Include lowest value in 1st bin

summary(myVarQ2)                       # Show number of obs in each quintile
head(cbind(myVar, myVarQ2))            # First 6 obs; bin shown as its code


# 6.6 Sorting and Ordering Data ============================================ 6.6

# 6.6a Sorting a variable ================================================= 6.6a
# Sorting example --------------------------------------------------------------

numbers = c(1, 6, 5, 7, 8)             # A short, unsorted numeric vector
sort(x = numbers)                      # Its values in increasing order

# Ordering example -------------------------------------------------------------
# order() returns the positions that would sort its argument; indexing with
# them rearranges the vector. Where upper- and lower-case spellings end up
# relative to one another depends on the locale's collation rules.
fruit = c("Apple", "banana", "apple", "Banana")
fruit = fruit[order(fruit)]            # Rearrange fruit into sorted order
fruit                                  # Print the rearranged vector


# 6.6b Ordering a data frame ============================================== 6.6b

# Show the difference between order() and sort() -------------------------------
numbers = c(1, 6, 5, 7, 8)             # The same unsorted vector as before
sort(x = numbers)                      # The values themselves: 1 5 6 7 8
order(numbers)                         # Their positions in numbers: 1 3 2 4 5

# A data frame ordering example ------------------------------------------------

myDF2 = subset(ch5data,                # For tractability, keep only ch5data
  subset = year > 1940 & year < 1945)  #   rows with year after 1940, before 1945
myDF2 = myDF2[order(myDF2$year), ]     # Rearrange the rows by year
myDF2                                  # Print the ordered rows

# Ordering by two variables ----------------------------------------------------

myDF2 = myDF2[                         # Rearrange the rows again: by year,
  order(myDF2$year, myDF2$animal), ]   #   with ties broken by animal name
myDF2                                  # Print the result