# Survey Data Analysis in R: Basic Introduction

What is R?

R is an object-oriented programming language that is designed for data analysis. Prior programming knowledge is helpful when working with R; however, a person who is new to programming can still get started with it.

A few concepts of object-oriented programming languages:

1. Class: Class is a blueprint of an object. The class describes the content of the object that belongs to it. The class defines the overall data structure.

2. Object: Object can be anything from numbers to characters to DateTime. The nature of an object is defined by its class. The objects have the behavior of their class.

3. Polymorphism: Polymorphism is the ability of an object to take on many forms. The most common use of polymorphism in OOP occurs when a parent class reference is used to refer to a child class object (Source: Tutorialspoint.com).

4. Inheritance: Inheritance is a process of defining a new class based on an existing class by extending its common data members and methods.

R programming is not as complex as Java programming or C++. We use packages in R to make our work simple.

Data types in R

1. Scalar: Scalar is a number or a character.

2. Vector: Vector is a combination of numbers or characters.

3. Dataframe: A dataframe is a table-like combination of vectors of equal length, where each vector forms one column.

4. Matrix: A matrix is a rectangular array of numbers (or other mathematical objects) for which operations such as addition and multiplication are defined.

5. List: A list is an object that can contain elements of different types – such as strings, numbers, vectors, and even other lists.

Defining Object in R

Everything in R can be an object. We shall give a name to an object and assign value to it. The value of an object can be a number, a vector, a matrix, a dataframe, a list, ...

object_name <- 4  # a scalar: one value bound to a name

# assign() does the same binding, with the object name given as a string
assign("object_name", 4)

Here, 4 is assigned to an object named 'object_name'.

object_name1 = c(4, 5, 6)  # a vector: several values combined with c()

Here, c = combine; and 4,5,6 are assigned to object named 'object_name1'.

Both <- and = assign a value to an object and work the same way here, but <- (keyboard shortcut in RStudio: Alt + -) is the more commonly used form.

Setting working directory in R

# Print the current working directory
getwd()

# General form: setwd("directory_path")
# Write the path with forward slashes (replace every \ with /), for example:
setwd("D:/~~~SPSS session/Materials")

Survey Data Analysis in R begins...

# Show the current working directory
getwd()

# Set the working directory to the folder that holds the Excel file
setwd("D:/~~~SPSS session/Materials")

# Install the readxl package once, if it is not installed yet, by removing
# the leading '#' from the next line.
#install.packages("readxl")

# Load readxl: it provides read_excel(), which is called below. Without this
# call read_excel() is not found.
library(readxl)

# Import the "Data" sheet of the Excel workbook
sudal_dataset <- read_excel(
  "2077.06.18 SPSS dataset sudal.xlsx",
  sheet = "Data"
)

# Note:
# 1. sudal_dataset is an object, and this object contains the imported data
# 2. General form: read_excel("name of datafile.xlsx", sheet = "Name of sheet")

# Open the dataset in the data viewer
View(sudal_dataset)

# Display variable names
names(sudal_dataset)

# Show number of variables (columns)
length(sudal_dataset)

# Show number of observations, counted as the length of the SN column
length(sudal_dataset$SN)

# Number of observations (rows) and variables (columns), in that order
dim(sudal_dataset)

# Show row names
rownames(sudal_dataset)

# Show variable names
colnames(sudal_dataset)

# Display first 6 observations
head(sudal_dataset)

# Show first 20 observations
head(sudal_dataset, n = 20)

# Data structure: type and first values of every variable
str(sudal_dataset)

# Open only the Household column in the data viewer
View(sudal_dataset$Household)

# Dealing with missing values

# TRUE/FALSE for every cell: TRUE where the value is missing (NA)
is.na(sudal_dataset)

# Open the dataset in R's interactive data editor
fix(sudal_dataset)

# Total number of missing values in the whole dataset
sum(is.na(sudal_dataset))

# Recode values of variable Gender such that "Female = 0" instead of
# "Female = 2".
# General form:
# dataset$variable_name[dataset$variable_name == old_value] <- new_value
sudal_dataset$Gender[sudal_dataset$Gender == 2] <- 0

# There are missing values. na.omit() does not change sudal_dataset itself;
# it returns a copy without the rows that contain at least one NA. The copy is
# stored under its own name so that it is not lost, and sudal_dataset stays
# complete for the steps that follow.
sudal_complete <- na.omit(sudal_dataset)

# Display the rows that have no missing values
sudal_complete

# Labeling variable values
#
# General form:
# dataset$variable_name <- factor(dataset$variable_name,
#                                 levels = c(level1, level2, ...),
#                                 labels = c("name1", "name2", ...))
# Each numeric code in `levels` is shown as the label at the same position in
# `labels`. Values that are not listed in `levels` become NA.

# Labeling variable Gender
sudal_dataset$Gender <- factor(
  sudal_dataset$Gender,
  levels = c(0, 1),
  labels = c("Female", "Male")
)
summary(sudal_dataset$Gender)

# Labeling variable Area
sudal_dataset$Area <- factor(
  sudal_dataset$Area,
  levels = c(1, 2),
  labels = c("Sudal", "Koteshwor")
)
summary(sudal_dataset$Area)

# Labeling variable Education
sudal_dataset$Education <- factor(
  sudal_dataset$Education,
  levels = c(1, 2, 3, 4, 5, 6),
  labels = c(
    "Never attended school", "Attended school", "SLC",
    "Intermediate", "Bachelors", "Masters"
  )
)
summary(sudal_dataset$Education)

# Labeling variable Employment
sudal_dataset$Employment <- factor(
  sudal_dataset$Employment,
  levels = c(1, 2),
  labels = c("Employed", "Unemployed")
)
summary(sudal_dataset$Employment)

# Labeling variable Family type.
# The column name contains a space, so it must be wrapped in backticks.
sudal_dataset$`Family type` <- factor(
  sudal_dataset$`Family type`,
  levels = c(1, 2),
  labels = c("Nuclear", "Joint")
)
summary(sudal_dataset$`Family type`)

# Labeling variable Ethnicity
sudal_dataset$Ethnicity <- factor(
  sudal_dataset$Ethnicity,
  levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  labels = c(
    "Kami", "Brahmin", "Newar", "Chettri", "Sunuwar",
    "Magar", "Thing", "Tharu", "Madhesi", "Tamang"
  )
)
summary(sudal_dataset$Ethnicity)

# Recode variable Sector: every "4" becomes "3".
# General form:
# dataset$variable_name[dataset$variable_name == old_value] <- new_value
sudal_dataset$Sector <- replace(
  sudal_dataset$Sector,
  sudal_dataset$Sector == 4,
  3
)

# Label variable Sector.
# NOTE(review): code 4 was just recoded to 3, so the "Foreign Employment"
# level is presumably left without observations -- confirm this is intended.
sudal_dataset$Sector <- factor(
  sudal_dataset$Sector,
  levels = 1:4,
  labels = c("Agriculture", "Business", "Service", "Foreign Employment")
)

# Label variable Occupation: codes 1 to 15 map to the labels in order
sudal_dataset$Occupation <- factor(
  sudal_dataset$Occupation,
  levels = 1:15,
  labels = c(
    "Tailor", "Farmer", "Blacksmith", "Police", "Guard",
    "Self-owned business", "Beautician", "Teacher", "Driver", "Cook",
    "Training Center", "Staff", "Foreign Employment", "Labour",
    "Government Servent"
  )
)

# Creating tables

# Two-way frequency table: Gender in rows, Ethnicity in columns
mytable <- table(sudal_dataset$Gender, sudal_dataset$Ethnicity)

mytable  # print table

margin.table(mytable, 1)  # row totals: Gender frequencies summed over Ethnicity

margin.table(mytable, 2)  # column totals: Ethnicity frequencies summed over Gender

# prop.table() returns proportions between 0 and 1; multiply by 100 for
# percentages.
prop.table(mytable)     # each cell as a share of the grand total

prop.table(mytable, 1)  # each cell as a share of its row total

prop.table(mytable, 2)  # each cell as a share of its column total

# Save table as a comma-separated file in the working directory.
# col.names = NA writes an empty header cell above the row names, so the
# header line has as many fields as the data lines and the columns line up
# when the file is opened as a CSV.
write.table(
  mytable,
  file = "table_1.csv",
  sep = ",",
  quote = FALSE,
  row.names = TRUE,
  col.names = NA
)

# attach(sudal_dataset) would make the columns available by bare name, but
# attached columns can silently mask, or be masked by, other objects with the
# same name. with() and the data = argument give the same short syntax
# without that risk, so they are used here instead.

mytable1 <- with(sudal_dataset, table(Gender, Ethnicity))

mytable1

# col.names = NA keeps the header aligned with the row-name column
write.table(
  mytable1,
  file = "table_2.csv",
  sep = ",",
  quote = FALSE,
  row.names = TRUE,
  col.names = NA
)

# Creating charts

# Box plot
# General form:
# boxplot(variable_name(numerical) ~ variable_name(categorical),
#         data = dataset, main = "Chart Title")

# Single box plot of food expenses for all observations
boxplot(sudal_dataset$Food_today, main = "Box plot")

# One box per employment status; the title names the grouping variable that
# the formula actually uses.
boxplot(
  Food_today ~ Employment,
  data = sudal_dataset,
  main = "Box plot of Food expenses by Employment"
)

# Scatter plot
# General form:
# plot(variable_name1 ~ variable_name2, data = dataset)

# The column name `Total Income` contains a space, so it needs backticks
plot(Food_today ~ `Total Income`, data = sudal_dataset)

# Using ggplot2 for attractive graphs

# Install the packages only when they are missing, so that re-running the
# script does not download and reinstall them every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}

library(ggplot2)

library(ggthemes)

# Box plot: food expenses by gender, one box per employment status

boxchart <- ggplot(
  sudal_dataset,
  aes(x = Gender, y = Food_today, fill = Employment)
) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 4) +
  stat_boxplot(geom = "errorbar")

# Add the title and axis labels to the base chart
boxchart_final <- boxchart +
  labs(
    title = "Box plot of Food expenditure by Gender clustered by Employment",
    x = "Gender",
    y = "Food expenses"
  )

boxchart_final

# Flip the box plot so that the boxes run horizontally
boxchart_final + coord_flip()

# Bar chart: food expenses by gender, bars filled by area.
# geom_col() uses the y values as given, like geom_bar(stat = "identity").

barchart <- ggplot(
  sudal_dataset,
  aes(x = Gender, y = Food_today, fill = Area)
) +
  geom_col()

barchart

# Pie chart: share of food expenses by gender.
# A pie chart in ggplot2 is a single stacked bar drawn in polar coordinates:
#   x = ""          puts all observations into one bar,
#   y = Food_today  is the quantity being divided up,
#   fill = Gender   colours the slices (color would only draw outlines),
#   coord_polar("y") bends the stacked bar into a circle.

pie <- ggplot(sudal_dataset, aes(x = "", y = Food_today, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0)

pie

# Scatter plot: food expenditure against total income, coloured by gender,
# with one panel per employment status and a straight-line fit per gender.
# The column name `Total Income` contains a space, so it needs backticks.
# NOTE(review): both variables are divided by 10^6 while the axis labels say
# "(Rs.)" -- confirm the intended unit.

scatterplot <- ggplot(
  sudal_dataset,
  aes(
    x = Food_today / 10^6,
    y = `Total Income` / 10^6,
    color = Gender,
    linetype = Gender
  )
) +
  geom_point(size = 2.5) +
  # Linear fit per group; se = FALSE hides the confidence band
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~Employment, nrow = 2) +
  # Both axes on a log10 scale
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  ggtitle("Food expenditure and Total income by Gender grouped by Employment") +
  xlab("Food expenditure (Rs.) (log(10))") +
  ylab("Total income (Rs.) (log(10))") +
  theme_economist()

scatterplot

# Save scatterplot as a JPEG file in the working directory

# Open a JPEG graphics device of 650 x 450 pixels
jpeg("scatterplot.jpg", width = 650, height = 450)

# print() draws the plot explicitly. A bare `scatterplot` is only drawn when
# typed at the console; inside source() or a function nothing would be drawn
# and the file would be empty.
print(scatterplot)

# Close the device so that the file is written to disk
dev.off()

To be continued...