We have these files: cholest.csv, cholest.sav, cholest.dta, and cholest.xlsx.
Note: Always make sure that you set the working directory first!
# Read the CSV file with base R; no extra package is needed.
data.csv <- read.csv("cholest.csv")
data.csv
# The foreign package reads .sav (SPSS) and .dta (Stata) files.
library(foreign)
# to.data.frame = TRUE returns a data frame instead of a list.
data.sav <- read.spss("cholest.sav", to.data.frame = TRUE) # SPSS
data.sav
data.dta <- read.dta("cholest.dta") # Stata
data.dta
# The readxl package reads Excel files; it must be installed first.
library(readxl)
data.xls <- read_excel("cholest.xlsx", sheet = 1) # first worksheet
data.xls
# Structure: object class, size, and the type of every column.
str(data.csv)
## 'data.frame': 80 obs. of 5 variables:
## $ chol : num 6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
## $ age : int 38 35 39 36 31 38 33 36 40 34 ...
## $ exercise: int 6 5 6 5 4 4 5 5 4 6 ...
## $ sex : int 1 1 1 1 1 1 1 1 1 1 ...
## $ categ : int 0 0 0 0 0 0 0 0 0 0 ...
# Dimensions: number of rows (cases), then number of columns (variables).
dim(data.csv)
## [1] 80 5
# Column (variable) names.
names(data.csv)
## [1] "chol" "age" "exercise" "sex" "categ"
head(data.csv, n = 6) # first 6 rows
tail(data.csv, n = 6) # last 6 rows
data.csv # print every row to the console
View(data.csv) # open the data in a new tab instead of the console
# Three ways to look at "age" only.
data.csv$age # by name with $; gives a plain vector
## [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32 43 42 40
## [29] 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42 45 38 34 44 39 38
## [57] 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45 52 35 43 47 44
data.csv["age"] # by name with [ ]; gives a one-column data frame
data.csv[2] # by position; "age" is the 2nd column
In general, the syntax is:
data[row (number/name), col (number/name)]
# Rows 1 to 10; columns 2 to 4.
data.csv[1:10, 2:4]
# Rows 1, 3, 5, 7 and 9; columns age and chol.
data.csv[c(1, 3, 5, 7, 9), c("age", "chol")]
# Rows where age equals 38; columns age and chol.
data.csv[data.csv$age == 38, c("age", "chol")]
# Rows where sex equals 1; columns sex and chol.
data.csv[data.csv$sex == 1, c("sex", "chol")]
We can also use subset(), with the syntax:
subset(data, condition, variables)
# Rows where age equals 38, keeping every column.
subset(data.csv, subset = age == 38)
# Same rows, keeping only the columns from age through sex.
subset(data.csv, subset = age == 38, select = age:sex)
We use data.sav, which has category labels.
# Structure: numerical columns show as num, categorical columns as Factor.
str(data.sav)
## 'data.frame': 80 obs. of 5 variables:
## $ chol : num 6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
## $ age : num 38 35 39 36 31 38 33 36 40 34 ...
## $ exercise: num 6 5 6 5 4 4 5 5 4 6 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
## $ categ : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
## ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
## - attr(*, "codepage")= int 65001
# Summary statistics for numerical columns; counts per level for factors.
summary(data.sav)
## chol age exercise sex categ
## Min. : 6.50 Min. :28.00 Min. :2.000 female:40 Grp A:25
## 1st Qu.: 7.60 1st Qu.:36.00 1st Qu.:4.000 male :40 Grp B:33
## Median : 8.30 Median :39.00 Median :4.000 Grp C:22
## Mean : 8.23 Mean :39.48 Mean :4.225
## 3rd Qu.: 8.80 3rd Qu.:43.25 3rd Qu.:5.000
## Max. :10.00 Max. :52.00 Max. :6.000
library(psych) # provides describe()
# Descriptive statistics for the three numerical columns only.
describe(data.sav[c("chol", "age", "exercise")])
# Frequency table of sex.
table(data.sav$sex)
##
## female male
## 40 40
# Frequency table of categ.
table(data.sav$categ)
##
## Grp A Grp B Grp C
## 25 33 22
# Histogram of cholesterol.
hist(data.sav$chol)
# Box plot of cholesterol.
boxplot(data.sav$chol)
# Scatter plot of cholesterol against age, then the fitted regression line
# drawn on top of it; this needs two separate calls.
plot(data.sav$age, data.sav$chol)
abline(lm(chol ~ age, data = data.sav))
# Bar plot of the number of cases in each sex category.
count <- table(data.sav$sex)
barplot(count, col = c("blue", "red"))
Exploring Data Using R
Book: https://penerbit.usm.my/index.php/buku/425-exploring-data-using-r