Stata is a very powerful, though not open-source, data analysis package.
// Note: text following // or * is a comment. The commands in quotes are Stata syntax.
//By Mr. Rohan Byanjankar
Supportive materials
Excel file: Download
Do file: Download
//Easily copiable Do file Syntax
PDF file with Syntax: Download
// Clear any dataset currently in memory; import excel refuses to run
// over existing data unless memory is cleared first.
clear
// Importing data
*syntax: import excel "<path to workbook>", sheet("<sheet name>") firstrow
// firstrow: use the first row of the sheet as variable names.
// Adjust the path below to wherever the workbook is stored on your machine.
import excel "D:\~~~SPSS session\Materials\Files\2077.06.18 SPSS dataset sudal.xlsx", sheet("Data") firstrow
// Browse data: open the read-only viewer on the first 20 observations
browse in 1/20
// Editing data: open the Data Editor in edit mode
edit
// Labeling (describing) a variable
label variable Household "Household ID"
// Renaming a variable
rename Membersinfamily hhsize
// Generating a new variable
*syntax: gen new_var = operation to be performed
gen new_var=0
// Dropping a variable
drop new_var
// Count observations that satisfy a condition.
// Stata treats missing values as larger than any number, so a bare
// "Age>=40" would also count respondents whose Age is missing.
count if Age>=40 & !missing(Age)
// Sort the dataset in ascending order of Age
sort Age
// Labeling the values of categorical variables.
// Step 1: define a value label (a mapping code -> text).
// Step 2: attach that value label to the variable with "label values".
*syntax: label define <label_name> <code> "<text>" [<code> "<text>" ...]
*syntax: label values <variable_name> <label_name>
label define Gender 1"Male" 2"Female"
label values Gender Gender
// Show the contents of a value label
label list Gender
// Labeling for Religion
label define Religion 1"Hinduism" 2"Kirat" 3"Buddhist"
label values Religion Religion
label list Religion
// Labeling for family type
label define Familytype 1"Nuclear" 2"Joint"
label values Familytype Familytype
// Labeling for Education (/// continues the command on the next line)
label def Education 1"Never Attended School" 2"Attended School" ///
3"SLC" 4"Intermediate" 5"Bachelors" 6"Masters"
label values Education Education
// Labeling for Area
label def Area 1"Sudal" 2"Koteshwor"
label values Area Area
// The same process applies to labeling other variables
// Recoding variable Age into age groups
*syntax: recode <variable_name> (range=code "label") ..., gen(<new_variable_name>)
// The ranges share their end points (20, 30, ...). recode applies the FIRST
// rule that matches, so the rules are listed from the oldest group down:
// a boundary age then falls into the upper group (20 -> "20-30",
// 60 -> "60+"), which agrees with the labels "Below 20" and "60+".
recode Age (60/max=6 "60+") (50/60=5 "50-60") (40/50=4 "40-50") ///
(30/40=3 "30-40") (20/30=2 "20-30") ///
(min/20=1 "Below 20"), gen(age_group)
// Check the result side by side with the original variable
list Age age_group
label var age_group "Grouping of Age"
// Producing tables (one-way frequency tables)
tab Gender
tab Gender,missing //include missing values as a category
tab Occupation
tab Occupation,missing
tab Religion //display with label name
// Show the numeric codes instead of the value labels
tab Religion, nol
tab Gender,nol //do not display labels
tab Education
// All one-way tables in a single command
*syntax: tab1 var_1 var_2 var_n
tab1 Gender Education Familytype
// Two-way tables
*syntax: tab row_var col_var
tab Gender Education
tab Gender Area
// All two-way tables: tab2 tabulates every pair of the listed variables,
// so each variable is listed only once.
tab2 Gender Area Education
// Two-way table with row percent
tab Gender Education, row
tab Gender Area, row
// Two-way table with column percent
tab Gender Education, col
tab Gender Area, col
// Two-way table with row percent but no frequency
tab Gender Education, row nofreq
tab Gender Area, row nofreq
// Two-way table with column percent but no frequency
tab Gender Education, col nofreq
tab Gender Area, col nofreq
// Graphs
// Bar chart: with no y-variable, graph bar plots the percentage of
// observations in each category of the over() variable.
graph bar, over(Area) asyvars ///
    title("Percentage of respondents by Area", margin(b=4)) ///
    ytitle("Percentage") ylabel(none) yscale(r(0,70)) ///
    blabel(bar, position(outside) size(11pt)) ///
    caption("Source: Author's own data")
// Pie chart of the same distribution
graph pie, over(Area)
// Box plot of food expenses, one box per area
graph box Food_today, over(Area) asyvars ///
    title("Box plot of Food expenses in Sudal and Koteshwor", margin(b=4)) ///
    ytitle("Food expenses in Rs.") ylabel(none) ///
    box(1, color(red)) ///
    box(2, color(navy)) ///
    legend(stack size(small) title(Area of respondents, size(medium)))
// Save the graph currently in memory (the box plot) as a PNG file
graph export boxplot.png, replace
// Summary statistics (detail adds percentiles, skewness and kurtosis)
sum Food_today,detail
// Correlation
*syntax: corr var_list
corr Food_today Food_10_years_ago
// Let's try another command: pairwise correlation
*syntax: pwcorr var_list
// Plain correlation
pwcorr Food_today Food_10_years_ago
// Correlation with its significance level (p-value)
pwcorr Food_today Food_10_years_ago,sig
// Correlation with significance flagged by a star
pwcorr Food_today Food_10_years_ago,star(0.05) //show a star if sig. < 0.05
// Hypothesis tests
// One-sample t test
// null hypothesis (hypothesis of no difference): mean food expenses = 10000
ttest Food_today == 10000
// One-sample t test by group
by Area,sort:ttest Food_today == 10000
// Independent-sample t test
// null: there is no difference between food expenses in Sudal and Koteshwor.
// one numerical variable and one categorical variable
*syntax: ttest numerical_var,by(categorical variable)
// equal variance
ttest Food_today,by(Area)
// unequal variance
ttest Food_today,by(Area) unequal
// Paired t test
// both variables numerical
// null/claim: food expenses today = food expenses 10 years ago
ttest Food_today==Food_10_years_ago
// z test
// The z test is valid for large samples; the t test is valid for both
// large and small samples.
// null: food expenses = 10000
// If p<0.05 we reject the null hypothesis.
// ztest assumes a known standard deviation and defaults to sd(1) when none
// is given, which is meaningless for expenses measured in rupees, so the
// sample standard deviation is supplied explicitly.
// NOTE: run these lines together; local macros do not survive between
// separately executed selections.
quietly summarize Food_today
ztest Food_today == 10000, sd(`r(sd)')
// Independent-sample z test
// null: there is no difference between food expenses in Sudal and Koteshwor.
// one numerical variable and one categorical variable
*syntax: ztest numerical_var,by(categorical variable) sd1(#) sd2(#)
// Area is coded 1 = Sudal, 2 = Koteshwor
quietly summarize Food_today if Area==1
local sd_sudal = r(sd)
quietly summarize Food_today if Area==2
local sd_koteshwor = r(sd)
ztest Food_today,by(Area) sd1(`sd_sudal') sd2(`sd_koteshwor')
// ANOVA: an extension of the independent-sample t test
//   ANOVA  -> one numerical variable and one categorical variable with
//             more than two categories
//   t test -> one numerical variable and one categorical variable with
//             only two categories
*syntax: anova num_var cat_var
// null: there is no difference in food expenses among religious groups
// If p<0.05 we reject the null hypothesis.
anova Food_today Religion
// Chi-square test of association
// Use it when both variables are categorical.
// null: there is no association between Gender and Area
tabulate Gender Area, chi2
// Add the expected count to each cell
tabulate Gender Area, chi2 expected
// If an expected count is less than 5, Pearson's chi-square is not
// appropriate, so we move to Fisher's exact test.
tabulate Gender Area, chi2 expected exact
// Add the likelihood-ratio chi-square as well
tabulate Gender Area, chi2 expected exact lrchi2
// Regression
// Multiple linear regression
*syntax: reg num_dep_var ind_vars
// Generating new variables
*syntax: gen new_var_name = ln(old_var_name) //in case of log transformation
gen ln_food_today=ln(Food_today)
gen ln_avg_income=ln(AverageIncome)
// Regression begins (the i. prefix marks a categorical regressor)
reg ln_food_today i.Area i.Gender Age ln_avg_income
// Result interpretation
// Area: the food expenses in Koteshwor are 65 percent higher than the food expenses in Sudal.
// Income: if income increases by 10 percent, then food expenses increase by 1.9 percent.
// Diagnostic tests after the regression
// Multicollinearity: variance inflation factors
// (estat vif is the current form of the older vif command)
estat vif
// rule: if vif is less than 10, the model is free from multicollinearity
// Heteroskedasticity
estat hettest
// Autocorrelation (Durbin-Watson): only for time-series data.
// estat dwatson needs the data to be declared with tsset; on cross-sectional
// data it stops with an error. capture noisily shows that message but lets
// the rest of the do-file continue.
capture noisily estat dwatson
// Logistic regression
*syntax: logit cat_dep_var ind_vars
// Binary dependent variable: 1 = Male (Gender==1), 0 = otherwise.
// The "if !missing(Gender)" keeps respondents with unknown gender as missing
// instead of silently coding them 0.
gen gender1=(Gender==1) if !missing(Gender)
// Logit coefficients
// Logit coefficients cannot be interpreted like OLS coefficients. They only
// show the direction of the relationship between the dependent and the
// independent variables.
logit gender1 ln_food_today ln_avg_income i.Area i.Familytype
// Odds ratio
// It is the odds ratio that matters the most. The odds of an event are the
// probability that it happens divided by the probability that it does not,
// p/(1-p); an odds ratio compares these odds between two groups (or per
// one-unit increase in a regressor). An odds ratio greater than 1 means
// higher odds of the event. Suppose the odds ratio for "landless" in a logit
// regression of poverty is 1.2: the odds (not the probability) of being poor
// are 20 percent [(1.2-1)*100] higher for "landless" households than for
// households with "land".
// We will use outreg2 (user-written) to export results later
ssc install outreg2,replace
logit gender1 ln_food_today ln_avg_income i.Area i.Familytype,or
// Specification tests for the logit model
// Test: misspecification (link) test.
// linktest ships with Stata, so there is nothing to install.
linktest
// Goodness of fit
estat gof
// fitstat (user-written): install it only if it is not already available.
// NOTE(review): the SSC copy of fitstat is assumed here; confirm that it
// accepts factor-variable (i.) notation in your Stata version.
capture which fitstat
if _rc ssc install fitstat
fitstat
// Heteroskedasticity and multicollinearity
// (checked on the linear regression with the same variables)
reg gender1 ln_food_today ln_avg_income i.Area i.Familytype
estat hettest
estat vif
// Logit coefficients
logit gender1 ln_food_today ln_avg_income i.Area i.Familytype
// Odds ratio
logit gender1 ln_food_today ln_avg_income i.Area i.Familytype,or
// Marginal effects
// mfx has been superseded by margins and does not support factor variables
// (i.Area, i.Familytype); margins with atmeans evaluates the effects at the
// means of the regressors, as mfx did by default.
margins,dydx(*) atmeans
// Export work from Stata to Word
// Installing the package: outreg2 is user-written, so install it once if it
// is not already available.
capture which outreg2
if _rc ssc install outreg2
// Linear regression: outreg2 exports the most recent estimation, so the
// model has to be estimated immediately before the export.
reg ln_food_today i.Area i.Gender Age ln_avg_income
outreg2 using word.doc,label
// Logit coefficients (replace starts a fresh word1.doc)
logit gender1 ln_food_today ln_avg_income i.Area i.Familytype
outreg2 using word1.doc,label ctitle(Logit Coeff) replace
// Odds ratio (append adds a column; eform exports exponentiated coefficients)
logit gender1 ln_food_today ln_avg_income i.Area i.Familytype,or
outreg2 using word1.doc,label ctitle(Odds ratio) append eform
// Marginal effects
// mfx does not support the factor variables in this model, so margins is
// used instead. The post option stores the marginal effects as the current
// estimation results, which outreg2 then exports like any other model.
margins,dydx(*) atmeans post
outreg2 using word1.doc,label ctitle(mfx) append
0 Comments