.csv (Spreadsheet/Excel)
data = read.csv("cholest.csv")
.sav (SPSS) and .dta (STATA)
Require foreign package:
library(foreign)
Read SPSS file, .sav,
data = read.spss("cholest.sav", to.data.frame = TRUE)
Read STATA files, .dta,
data = read.dta("cholest.dta")
Let’s continue using cholest.sav dataset,
data = read.spss("cholest.sav", to.data.frame = TRUE)
View full data by,
data
#> chol age exercise sex categ
#> 1 6.5 38 6 male Grp A
#> 2 6.6 35 5 male Grp A
#> 3 6.8 39 6 male Grp A
#> 4 6.8 36 5 male Grp A
#> 5 6.9 31 4 male Grp A
#> 6 7.0 38 4 male Grp A
#> 7 7.0 33 5 male Grp A
#> 8 7.2 36 5 male Grp A
#> 9 7.2 40 4 male Grp A
#> 10 7.2 34 6 male Grp A
#> 11 7.3 38 6 male Grp A
#> 12 7.3 40 5 male Grp A
#> 13 7.3 40 4 male Grp A
#> 14 7.3 28 5 male Grp A
#> 15 7.3 37 5 male Grp A
#> 16 7.4 38 4 male Grp A
#> 17 7.4 49 5 male Grp A
#> 18 7.4 29 5 male Grp A
#> 19 7.5 40 3 male Grp A
#> 20 7.6 38 5 male Grp A
#> 21 7.6 34 5 male Grp A
#> 22 7.6 46 4 male Grp A
#> 23 7.6 42 5 male Grp A
#> 24 7.6 38 4 male Grp A
#> 25 7.8 32 5 male Grp A
#> 26 7.8 43 4 male Grp B
#> 27 7.8 42 5 male Grp B
#> 28 7.8 40 5 male Grp B
#> 29 7.8 38 5 male Grp B
#> 30 7.9 39 5 male Grp B
#> 31 7.9 39 5 male Grp B
#> 32 7.9 39 5 male Grp B
#> 33 8.0 35 3 male Grp B
#> 34 8.0 38 4 male Grp B
#> 35 8.1 40 5 male Grp B
#> 36 8.1 38 4 male Grp B
#> 37 8.2 45 6 male Grp B
#> 38 8.2 36 4 male Grp B
#> 39 8.3 31 4 male Grp B
#> 40 8.3 34 5 male Grp B
#> 41 8.3 44 4 female Grp B
#> 42 8.3 35 5 female Grp B
#> 43 8.4 40 4 female Grp B
#> 44 8.4 37 6 female Grp B
#> 45 8.5 33 4 female Grp B
#> 46 8.5 46 4 female Grp B
#> 47 8.5 42 5 female Grp B
#> 48 8.5 40 4 female Grp B
#> 49 8.5 45 4 female Grp B
#> 50 8.5 42 5 female Grp B
#> 51 8.5 45 4 female Grp B
#> 52 8.6 38 5 female Grp B
#> 53 8.6 34 3 female Grp B
#> 54 8.6 44 4 female Grp B
#> 55 8.7 39 3 female Grp B
#> 56 8.7 38 4 female Grp B
#> 57 8.7 39 3 female Grp B
#> 58 8.8 47 3 female Grp B
#> 59 8.8 41 4 female Grp C
#> 60 8.8 44 4 female Grp C
#> 61 8.8 30 3 female Grp C
#> 62 8.9 48 3 female Grp C
#> 63 8.9 47 4 female Grp C
#> 64 8.9 42 4 female Grp C
#> 65 9.0 42 4 female Grp C
#> 66 9.0 49 3 female Grp C
#> 67 9.1 31 2 female Grp C
#> 68 9.2 38 3 female Grp C
#> 69 9.2 38 3 female Grp C
#> 70 9.3 48 3 female Grp C
#> 71 9.3 34 4 female Grp C
#> 72 9.3 45 3 female Grp C
#> 73 9.4 45 3 female Grp C
#> 74 9.4 36 4 female Grp C
#> 75 9.4 45 4 female Grp C
#> 76 9.5 52 4 female Grp C
#> 77 9.6 35 4 female Grp C
#> 78 9.8 43 3 female Grp C
#> 79 9.9 47 3 female Grp C
#> 80 10.0 44 3 female Grp C
or by,
View(data)
View the first and last six observations,
head(data)
#> chol age exercise sex categ
#> 1 6.5 38 6 male Grp A
#> 2 6.6 35 5 male Grp A
#> 3 6.8 39 6 male Grp A
#> 4 6.8 36 5 male Grp A
#> 5 6.9 31 4 male Grp A
#> 6 7.0 38 4 male Grp A
tail(data)
#> chol age exercise sex categ
#> 75 9.4 45 4 female Grp C
#> 76 9.5 52 4 female Grp C
#> 77 9.6 35 4 female Grp C
#> 78 9.8 43 3 female Grp C
#> 79 9.9 47 3 female Grp C
#> 80 10.0 44 3 female Grp C
These are very useful for very long datasets, whenever we just want to get some idea about the data layout.
Basic information about the data,
str(data)
#> 'data.frame': 80 obs. of 5 variables:
#> $ chol : num 6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
#> $ age : num 38 35 39 36 31 38 33 36 40 34 ...
#> $ exercise: num 6 5 6 5 4 4 5 5 4 6 ...
#> $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
#> $ categ : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
#> - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
#> ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
#> - attr(*, "codepage")= int 65001
Most important, from this information
'data.frame': 80 obs. of 5 variables
we know that we have a data frame, with 80 observations and five variables. The names of the variables are listed in the output. ## Selecting parts of data Select a variable (column),
data$age
#> [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32
#> [26] 43 42 40 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42
#> [51] 45 38 34 44 39 38 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45
#> [76] 52 35 43 47 44
or
data[ , "age"]
#> [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32
#> [26] 43 42 40 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42
#> [51] 45 38 34 44 39 38 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45
#> [76] 52 35 43 47 44
or by column number
data[ , 2]
#> [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32
#> [26] 43 42 40 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42
#> [51] 45 38 34 44 39 38 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45
#> [76] 52 35 43 47 44
Select observations (rows),
data[10:20, ]
#> chol age exercise sex categ
#> 10 7.2 34 6 male Grp A
#> 11 7.3 38 6 male Grp A
#> 12 7.3 40 5 male Grp A
#> 13 7.3 40 4 male Grp A
#> 14 7.3 28 5 male Grp A
#> 15 7.3 37 5 male Grp A
#> 16 7.4 38 4 male Grp A
#> 17 7.4 49 5 male Grp A
#> 18 7.4 29 5 male Grp A
#> 19 7.5 40 3 male Grp A
#> 20 7.6 38 5 male Grp A
Select observations and variables,
data[10:20, c("age", "categ")]
#> age categ
#> 10 34 Grp A
#> 11 38 Grp A
#> 12 40 Grp A
#> 13 40 Grp A
#> 14 28 Grp A
#> 15 37 Grp A
#> 16 38 Grp A
#> 17 49 Grp A
#> 18 29 Grp A
#> 19 40 Grp A
#> 20 38 Grp A
or
data[10:20, c(2, 5)]
#> age categ
#> 10 34 Grp A
#> 11 38 Grp A
#> 12 40 Grp A
#> 13 40 Grp A
#> 14 28 Grp A
#> 15 37 Grp A
#> 16 38 Grp A
#> 17 49 Grp A
#> 18 29 Grp A
#> 19 40 Grp A
#> 20 38 Grp A
data[c(1:10, 20:30), c("age", "categ")]
#> age categ
#> 1 38 Grp A
#> 2 35 Grp A
#> 3 39 Grp A
#> 4 36 Grp A
#> 5 31 Grp A
#> 6 38 Grp A
#> 7 33 Grp A
#> 8 36 Grp A
#> 9 40 Grp A
#> 10 34 Grp A
#> 20 38 Grp A
#> 21 34 Grp A
#> 22 46 Grp A
#> 23 42 Grp A
#> 24 38 Grp A
#> 25 32 Grp A
#> 26 43 Grp B
#> 27 42 Grp B
#> 28 40 Grp B
#> 29 38 Grp B
#> 30 39 Grp B
More advanced row selection is done using
subset(data, row, column) function in R. >
(more than) is called condition. You can try with the rest of
conditions: < (less than), >= (more than
or equal), <= (less than or equal), ==
(equal), != (not equal).
subset(data, age > 35, c(age, categ))
#> age categ
#> 1 38 Grp A
#> 3 39 Grp A
#> 4 36 Grp A
#> 6 38 Grp A
#> 8 36 Grp A
#> 9 40 Grp A
#> 11 38 Grp A
#> 12 40 Grp A
#> 13 40 Grp A
#> 15 37 Grp A
#> 16 38 Grp A
#> 17 49 Grp A
#> 19 40 Grp A
#> 20 38 Grp A
#> 22 46 Grp A
#> 23 42 Grp A
#> 24 38 Grp A
#> 26 43 Grp B
#> 27 42 Grp B
#> 28 40 Grp B
#> 29 38 Grp B
#> 30 39 Grp B
#> 31 39 Grp B
#> 32 39 Grp B
#> 34 38 Grp B
#> 35 40 Grp B
#> 36 38 Grp B
#> 37 45 Grp B
#> 38 36 Grp B
#> 41 44 Grp B
#> 43 40 Grp B
#> 44 37 Grp B
#> 46 46 Grp B
#> 47 42 Grp B
#> 48 40 Grp B
#> 49 45 Grp B
#> 50 42 Grp B
#> 51 45 Grp B
#> 52 38 Grp B
#> 54 44 Grp B
#> 55 39 Grp B
#> 56 38 Grp B
#> 57 39 Grp B
#> 58 47 Grp B
#> 59 41 Grp C
#> 60 44 Grp C
#> 62 48 Grp C
#> 63 47 Grp C
#> 64 42 Grp C
#> 65 42 Grp C
#> 66 49 Grp C
#> 68 38 Grp C
#> 69 38 Grp C
#> 70 48 Grp C
#> 72 45 Grp C
#> 73 45 Grp C
#> 74 36 Grp C
#> 75 45 Grp C
#> 76 52 Grp C
#> 78 43 Grp C
#> 79 47 Grp C
#> 80 44 Grp C
You can also specify a combination of two or more conditions using
& (and) and | (or).
subset(data, age > 35 & sex == "female", c(age, sex, categ))
#> age sex categ
#> 41 44 female Grp B
#> 43 40 female Grp B
#> 44 37 female Grp B
#> 46 46 female Grp B
#> 47 42 female Grp B
#> 48 40 female Grp B
#> 49 45 female Grp B
#> 50 42 female Grp B
#> 51 45 female Grp B
#> 52 38 female Grp B
#> 54 44 female Grp B
#> 55 39 female Grp B
#> 56 38 female Grp B
#> 57 39 female Grp B
#> 58 47 female Grp B
#> 59 41 female Grp C
#> 60 44 female Grp C
#> 62 48 female Grp C
#> 63 47 female Grp C
#> 64 42 female Grp C
#> 65 42 female Grp C
#> 66 49 female Grp C
#> 68 38 female Grp C
#> 69 38 female Grp C
#> 70 48 female Grp C
#> 72 45 female Grp C
#> 73 45 female Grp C
#> 74 36 female Grp C
#> 75 45 female Grp C
#> 76 52 female Grp C
#> 78 43 female Grp C
#> 79 47 female Grp C
#> 80 44 female Grp C
*I covered for builtin functions. Keep in mind that you may also use
dplyr() or tidyverse styles. This can be
easily asked from ChatGPT/Gemini.
Create age in month,
data$age_month = data$age * 12
data$age_month
#> [1] 456 420 468 432 372 456 396 432 480 408 456 480 480 336 444 456 588 348 480
#> [20] 456 408 552 504 456 384 516 504 480 456 468 468 468 420 456 480 456 540 432
#> [39] 372 408 528 420 480 444 396 552 504 480 540 504 540 456 408 528 468 456 468
#> [58] 564 492 528 360 576 564 504 504 588 372 456 456 576 408 540 540 432 540 624
#> [77] 420 516 564 528
Histogram of age,
hist(data$age)
abline(v = 35, col = "red")
abline(v = 45, col = "red")
We decide to cut at 35 and 45 years old,
data$age_cat = cut(data$age, breaks = c(-Inf, 35, 45, Inf),
labels = c("<35", "35-45", ">45"))
levels(data$age_cat) # view the newly created categories/levels
#> [1] "<35" "35-45" ">45"
table(data$age_cat) # view counts per categories
#>
#> <35 35-45 >45
#> 18 52 10
We need car package to use recode()
function,
library(car)
#> Loading required package: carData
data$age_cat1 = recode(data$age_cat,
"c('35-45', '>45') = '35 & above'")
levels(data$age_cat1)
#> [1] "<35" "35 & above"
table(data$age_cat1)
#>
#> <35 35 & above
#> 18 62
Pay attention to the use of " " and ' ' in
recode().
We again use cholest.sav dataset. Now we give it a
proper object name cholest,
library(foreign)
cholest = read.spss("cholest.sav", to.data.frame = T)
str(cholest)
#> 'data.frame': 80 obs. of 5 variables:
#> $ chol : num 6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
#> $ age : num 38 35 39 36 31 38 33 36 40 34 ...
#> $ exercise: num 6 5 6 5 4 4 5 5 4 6 ...
#> $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
#> $ categ : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
#> - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
#> ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
#> - attr(*, "codepage")= int 65001
In general, simple descriptive statistics can be obtained using
summary() function,
summary(cholest)
#> chol age exercise sex categ
#> Min. : 6.50 Min. :28.00 Min. :2.000 female:40 Grp A:25
#> 1st Qu.: 7.60 1st Qu.:36.00 1st Qu.:4.000 male :40 Grp B:33
#> Median : 8.30 Median :39.00 Median :4.000 Grp C:22
#> Mean : 8.23 Mean :39.48 Mean :4.225
#> 3rd Qu.: 8.80 3rd Qu.:43.25 3rd Qu.:5.000
#> Max. :10.00 Max. :52.00 Max. :6.000
The results depend on the variable type. ## Central tendency and dispersion For numerical variables, we can obtain the measures of central tendency (mean and median) and dispersion (standard deviation, SD and interquartile range, IQR). Now we obtain in pairs of mean (SD) and median (IQR), Mean,
mean(cholest$chol)
#> [1] 8.23
mean(cholest$age)
#> [1] 39.475
Standard deviation, SD,
sd(cholest$chol)
#> [1] 0.8386849
sd(cholest$age)
#> [1] 5.128661
Median,
median(cholest$chol)
#> [1] 8.3
median(cholest$age)
#> [1] 39
and interquartile range, IQR,
IQR(cholest$chol)
#> [1] 1.2
IQR(cholest$age)
#> [1] 7.25
*Try exploring psych package for descriptive statistics.
## Proportions For categorical variables, we want to obtain the count
per group, proportions and percentages.
The count per group using table() function (we can also
obtain the counts from summary() function as done
before),
tab_sex = table(cholest$sex)
tab_categ = table(cholest$categ)
tab_sex
#>
#> female male
#> 40 40
tab_categ
#>
#> Grp A Grp B Grp C
#> 25 33 22
The proportions,
prop.table(tab_sex)
#>
#> female male
#> 0.5 0.5
prop.table(tab_categ)
#>
#> Grp A Grp B Grp C
#> 0.3125 0.4125 0.2750
and to obtain the percentages, we multiply the proportions by 100,
prop.table(tab_sex)*100
#>
#> female male
#> 50 50
prop.table(tab_categ)*100
#>
#> Grp A Grp B Grp C
#> 31.25 41.25 27.50
For numerical variables, we can obtain the statistics by groups (the
categorical variables) using by() function. The syntax is
by(numerical_variable, categorical_variable, function).
Mean and SD for chol by sex,
by(cholest$chol, cholest$sex, mean)
#> cholest$sex: female
#> [1] 8.9275
#> ------------------------------------------------------------
#> cholest$sex: male
#> [1] 7.5325
by(cholest$chol, cholest$sex, sd)
#> cholest$sex: female
#> [1] 0.4551627
#> ------------------------------------------------------------
#> cholest$sex: male
#> [1] 0.4687066
For categorical variables, it is important to be able to perform
cross-tabulation to explore the count per cells for each combination of
groups. Again, we use table() function.
For sex and categ, we obtain the basic
cross-tabulation,
tab_sex_categ = table(Gender = cholest$sex, Category = cholest$categ)
tab_sex_categ
#> Category
#> Gender Grp A Grp B Grp C
#> female 0 18 22
#> male 25 15 0
Notice we can give headers (“Gender” and “Category”) to groups in the table as shown above.
We can also easily obtain the proportions and percentages,
prop_sex_categ = prop.table(tab_sex_categ)
prop_sex_categ
#> Category
#> Gender Grp A Grp B Grp C
#> female 0.0000 0.2250 0.2750
#> male 0.3125 0.1875 0.0000
per_sex_categ = prop.table(tab_sex_categ)*100
per_sex_categ
#> Category
#> Gender Grp A Grp B Grp C
#> female 0.00 22.50 27.50
#> male 31.25 18.75 0.00
and add the marginal counts,
margin_sex_categ = addmargins(tab_sex_categ)
margin_sex_categ
#> Category
#> Gender Grp A Grp B Grp C Sum
#> female 0 18 22 40
#> male 25 15 0 40
#> Sum 25 33 22 80
and view the proportions and percentages again, including that of the marginal counts,
addmargins(prop_sex_categ)
#> Category
#> Gender Grp A Grp B Grp C Sum
#> female 0.0000 0.2250 0.2750 0.5000
#> male 0.3125 0.1875 0.0000 0.5000
#> Sum 0.3125 0.4125 0.2750 1.0000
addmargins(per_sex_categ)
#> Category
#> Gender Grp A Grp B Grp C Sum
#> female 0.00 22.50 27.50 50.00
#> male 31.25 18.75 0.00 50.00
#> Sum 31.25 41.25 27.50 100.00
lattice packageWe are going to use lattice package for plotting the
plots. It creates nice plots in R and easy to explore multivariate data
(i.e. many variables) and comparing plots by groups.
Load lattice package,
library(lattice)
We again use cholest.sav dataset. We give it
cholest object name,
library(foreign)
cholest = read.spss("cholest.sav", to.data.frame = TRUE)
and view the basic information of the dataset,
str(cholest)
#> 'data.frame': 80 obs. of 5 variables:
#> $ chol : num 6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
#> $ age : num 38 35 39 36 31 38 33 36 40 34 ...
#> $ exercise: num 6 5 6 5 4 4 5 5 4 6 ...
#> $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
#> $ categ : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
#> - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
#> ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
#> - attr(*, "codepage")= int 65001
head(cholest)
#> chol age exercise sex categ
#> 1 6.5 38 6 male Grp A
#> 2 6.6 35 5 male Grp A
#> 3 6.8 39 6 male Grp A
#> 4 6.8 36 5 male Grp A
#> 5 6.9 31 4 male Grp A
#> 6 7.0 38 4 male Grp A
Plot the histogram for chol,
histogram(~ chol, data = cholest)
Then plot the histogram for chol by sex and
categ groups,
histogram(~ chol | sex, data = cholest)
histogram(~ chol | categ, data = cholest)
To make the comparison easier, we can adjust the layout,
histogram(~ chol | sex, data = cholest, layout = c(1, 2))
histogram(~ chol | categ, data = cholest, layout = c(1, 3))
e.g. layout = c(1, 2) means 1 column (over the x-axis)
and 2 rows (along the y-axis).
Plot the box-and-whisker plot for chol,
bwplot(~ chol, data = cholest)
Then plot the box-and-whisker plot for chol by
sex and categ groups,
bwplot(sex ~ chol, data = cholest)
bwplot(categ ~ chol, data = cholest)
Notice the grouping is specified before ~ sign, which
set sex and categ on y-axis. The grouping can
also be specified on x-axis for nicer-looking plots,
bwplot(chol ~ sex, data = cholest)
bwplot(chol ~ categ, data = cholest)
For categorical variables, we can plot bar charts. But, first we need the count per group for the categorical variables,
table_sex = table(cholest$sex)
table_sex
#>
#> female male
#> 40 40
table_categ = table(cholest$categ)
table_categ
#>
#> Grp A Grp B Grp C
#> 25 33 22
Then using the counts, we can plot the charts,
barchart(table_sex)
barchart(table_categ)
For a more flexible chart setting, convert the table to a data frame,
let say for table_categ
df_categ = as.data.frame(table_categ)
colnames(df_categ) = c("Category", "Count") # set the column names
df_categ
#> Category Count
#> 1 Grp A 25
#> 2 Grp B 33
#> 3 Grp C 22
Then, we can plot with Category as x-axis,
barchart(Count ~ Category, data = df_categ)
We can also plot cross-tabulated groups
(e.g. sex vs categ). First, properly prepare the data
frame,
table_sex_categ = table(cholest$sex, cholest$categ) # obtain the count
df_sex_categ = as.data.frame(table_sex_categ) # save as data frame
colnames(df_sex_categ) = c("Sex", "Category", "Count") # give proper names
Then, plot the barcharts,
barchart(Count ~ Sex | Category, df_sex_categ, layout = c(3, 1))
barchart(Count ~ Category | Sex, df_sex_categ, layout = c(2, 1))
Plot a scatter plot to explore the relationship between cholestrol
and age (y-axis chol vs x-axis age),
xyplot(chol ~ age, data = cholest)
We can also explore the relationship by groups,
xyplot(chol ~ age | sex, data = cholest)
xyplot(chol ~ age | categ, data = cholest, layout = c(3,1))
There seem to be some patterns of cholestrol levels by groups.