1 Data management

1.1 Reading data

1.1.1 Basic file:

.csv (Spreadsheet/Excel)

data = read.csv("cholest.csv")

1.1.2 Other formats:

.sav (SPSS) and .dta (STATA)

Require foreign package:

library(foreign)

Read SPSS file, .sav,

data = read.spss("cholest.sav", to.data.frame = TRUE)

Read STATA files, .dta,

data = read.dta("cholest.dta")

1.2 Viewing data

Let’s continue using cholest.sav dataset,

data = read.spss("cholest.sav", to.data.frame = TRUE)

View full data by,

data
#>    chol age exercise    sex categ
#> 1   6.5  38        6   male Grp A
#> 2   6.6  35        5   male Grp A
#> 3   6.8  39        6   male Grp A
#> 4   6.8  36        5   male Grp A
#> 5   6.9  31        4   male Grp A
#> 6   7.0  38        4   male Grp A
#> 7   7.0  33        5   male Grp A
#> 8   7.2  36        5   male Grp A
#> 9   7.2  40        4   male Grp A
#> 10  7.2  34        6   male Grp A
#> 11  7.3  38        6   male Grp A
#> 12  7.3  40        5   male Grp A
#> 13  7.3  40        4   male Grp A
#> 14  7.3  28        5   male Grp A
#> 15  7.3  37        5   male Grp A
#> 16  7.4  38        4   male Grp A
#> 17  7.4  49        5   male Grp A
#> 18  7.4  29        5   male Grp A
#> 19  7.5  40        3   male Grp A
#> 20  7.6  38        5   male Grp A
#> 21  7.6  34        5   male Grp A
#> 22  7.6  46        4   male Grp A
#> 23  7.6  42        5   male Grp A
#> 24  7.6  38        4   male Grp A
#> 25  7.8  32        5   male Grp A
#> 26  7.8  43        4   male Grp B
#> 27  7.8  42        5   male Grp B
#> 28  7.8  40        5   male Grp B
#> 29  7.8  38        5   male Grp B
#> 30  7.9  39        5   male Grp B
#> 31  7.9  39        5   male Grp B
#> 32  7.9  39        5   male Grp B
#> 33  8.0  35        3   male Grp B
#> 34  8.0  38        4   male Grp B
#> 35  8.1  40        5   male Grp B
#> 36  8.1  38        4   male Grp B
#> 37  8.2  45        6   male Grp B
#> 38  8.2  36        4   male Grp B
#> 39  8.3  31        4   male Grp B
#> 40  8.3  34        5   male Grp B
#> 41  8.3  44        4 female Grp B
#> 42  8.3  35        5 female Grp B
#> 43  8.4  40        4 female Grp B
#> 44  8.4  37        6 female Grp B
#> 45  8.5  33        4 female Grp B
#> 46  8.5  46        4 female Grp B
#> 47  8.5  42        5 female Grp B
#> 48  8.5  40        4 female Grp B
#> 49  8.5  45        4 female Grp B
#> 50  8.5  42        5 female Grp B
#> 51  8.5  45        4 female Grp B
#> 52  8.6  38        5 female Grp B
#> 53  8.6  34        3 female Grp B
#> 54  8.6  44        4 female Grp B
#> 55  8.7  39        3 female Grp B
#> 56  8.7  38        4 female Grp B
#> 57  8.7  39        3 female Grp B
#> 58  8.8  47        3 female Grp B
#> 59  8.8  41        4 female Grp C
#> 60  8.8  44        4 female Grp C
#> 61  8.8  30        3 female Grp C
#> 62  8.9  48        3 female Grp C
#> 63  8.9  47        4 female Grp C
#> 64  8.9  42        4 female Grp C
#> 65  9.0  42        4 female Grp C
#> 66  9.0  49        3 female Grp C
#> 67  9.1  31        2 female Grp C
#> 68  9.2  38        3 female Grp C
#> 69  9.2  38        3 female Grp C
#> 70  9.3  48        3 female Grp C
#> 71  9.3  34        4 female Grp C
#> 72  9.3  45        3 female Grp C
#> 73  9.4  45        3 female Grp C
#> 74  9.4  36        4 female Grp C
#> 75  9.4  45        4 female Grp C
#> 76  9.5  52        4 female Grp C
#> 77  9.6  35        4 female Grp C
#> 78  9.8  43        3 female Grp C
#> 79  9.9  47        3 female Grp C
#> 80 10.0  44        3 female Grp C

or by,

View(data)

View the first and last six observations,

head(data)
#>   chol age exercise  sex categ
#> 1  6.5  38        6 male Grp A
#> 2  6.6  35        5 male Grp A
#> 3  6.8  39        6 male Grp A
#> 4  6.8  36        5 male Grp A
#> 5  6.9  31        4 male Grp A
#> 6  7.0  38        4 male Grp A
tail(data)
#>    chol age exercise    sex categ
#> 75  9.4  45        4 female Grp C
#> 76  9.5  52        4 female Grp C
#> 77  9.6  35        4 female Grp C
#> 78  9.8  43        3 female Grp C
#> 79  9.9  47        3 female Grp C
#> 80 10.0  44        3 female Grp C

These are very useful for very long datasets, whenever we just want to get some idea about the data layout.

Basic information about the data,

str(data)
#> 'data.frame':    80 obs. of  5 variables:
#>  $ chol    : num  6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
#>  $ age     : num  38 35 39 36 31 38 33 36 40 34 ...
#>  $ exercise: num  6 5 6 5 4 4 5 5 4 6 ...
#>  $ sex     : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ categ   : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
#>  - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
#>   ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
#>  - attr(*, "codepage")= int 65001

Most important, from this information

'data.frame': 80 obs. of 5 variables

we know that we have a data frame, with 80 observations and five variables. The names of the variables are listed in the output. ## Selecting parts of data Select a variable (column),

data$age
#>  [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32
#> [26] 43 42 40 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42
#> [51] 45 38 34 44 39 38 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45
#> [76] 52 35 43 47 44

or

data[ , "age"]
#>  [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32
#> [26] 43 42 40 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42
#> [51] 45 38 34 44 39 38 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45
#> [76] 52 35 43 47 44

or by column number

data[ , 2]
#>  [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32
#> [26] 43 42 40 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42
#> [51] 45 38 34 44 39 38 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45
#> [76] 52 35 43 47 44

Select observations (rows),

data[10:20, ]
#>    chol age exercise  sex categ
#> 10  7.2  34        6 male Grp A
#> 11  7.3  38        6 male Grp A
#> 12  7.3  40        5 male Grp A
#> 13  7.3  40        4 male Grp A
#> 14  7.3  28        5 male Grp A
#> 15  7.3  37        5 male Grp A
#> 16  7.4  38        4 male Grp A
#> 17  7.4  49        5 male Grp A
#> 18  7.4  29        5 male Grp A
#> 19  7.5  40        3 male Grp A
#> 20  7.6  38        5 male Grp A

Select observations and variables,

data[10:20, c("age", "categ")]
#>    age categ
#> 10  34 Grp A
#> 11  38 Grp A
#> 12  40 Grp A
#> 13  40 Grp A
#> 14  28 Grp A
#> 15  37 Grp A
#> 16  38 Grp A
#> 17  49 Grp A
#> 18  29 Grp A
#> 19  40 Grp A
#> 20  38 Grp A

or

data[10:20, c(2, 5)]
#>    age categ
#> 10  34 Grp A
#> 11  38 Grp A
#> 12  40 Grp A
#> 13  40 Grp A
#> 14  28 Grp A
#> 15  37 Grp A
#> 16  38 Grp A
#> 17  49 Grp A
#> 18  29 Grp A
#> 19  40 Grp A
#> 20  38 Grp A
data[c(1:10, 20:30), c("age", "categ")]
#>    age categ
#> 1   38 Grp A
#> 2   35 Grp A
#> 3   39 Grp A
#> 4   36 Grp A
#> 5   31 Grp A
#> 6   38 Grp A
#> 7   33 Grp A
#> 8   36 Grp A
#> 9   40 Grp A
#> 10  34 Grp A
#> 20  38 Grp A
#> 21  34 Grp A
#> 22  46 Grp A
#> 23  42 Grp A
#> 24  38 Grp A
#> 25  32 Grp A
#> 26  43 Grp B
#> 27  42 Grp B
#> 28  40 Grp B
#> 29  38 Grp B
#> 30  39 Grp B

More advanced row selection is done using subset(data, row, column) function in R. > (more than) is called condition. You can try with the rest of conditions: < (less than), >= (more than or equal), <= (less than or equal), == (equal), != (not equal).

subset(data, age > 35, c(age, categ))
#>    age categ
#> 1   38 Grp A
#> 3   39 Grp A
#> 4   36 Grp A
#> 6   38 Grp A
#> 8   36 Grp A
#> 9   40 Grp A
#> 11  38 Grp A
#> 12  40 Grp A
#> 13  40 Grp A
#> 15  37 Grp A
#> 16  38 Grp A
#> 17  49 Grp A
#> 19  40 Grp A
#> 20  38 Grp A
#> 22  46 Grp A
#> 23  42 Grp A
#> 24  38 Grp A
#> 26  43 Grp B
#> 27  42 Grp B
#> 28  40 Grp B
#> 29  38 Grp B
#> 30  39 Grp B
#> 31  39 Grp B
#> 32  39 Grp B
#> 34  38 Grp B
#> 35  40 Grp B
#> 36  38 Grp B
#> 37  45 Grp B
#> 38  36 Grp B
#> 41  44 Grp B
#> 43  40 Grp B
#> 44  37 Grp B
#> 46  46 Grp B
#> 47  42 Grp B
#> 48  40 Grp B
#> 49  45 Grp B
#> 50  42 Grp B
#> 51  45 Grp B
#> 52  38 Grp B
#> 54  44 Grp B
#> 55  39 Grp B
#> 56  38 Grp B
#> 57  39 Grp B
#> 58  47 Grp B
#> 59  41 Grp C
#> 60  44 Grp C
#> 62  48 Grp C
#> 63  47 Grp C
#> 64  42 Grp C
#> 65  42 Grp C
#> 66  49 Grp C
#> 68  38 Grp C
#> 69  38 Grp C
#> 70  48 Grp C
#> 72  45 Grp C
#> 73  45 Grp C
#> 74  36 Grp C
#> 75  45 Grp C
#> 76  52 Grp C
#> 78  43 Grp C
#> 79  47 Grp C
#> 80  44 Grp C

You can also specify a combination of two or more conditions using & (and) and | (or).

subset(data, age > 35 & sex == "female", c(age, sex, categ))
#>    age    sex categ
#> 41  44 female Grp B
#> 43  40 female Grp B
#> 44  37 female Grp B
#> 46  46 female Grp B
#> 47  42 female Grp B
#> 48  40 female Grp B
#> 49  45 female Grp B
#> 50  42 female Grp B
#> 51  45 female Grp B
#> 52  38 female Grp B
#> 54  44 female Grp B
#> 55  39 female Grp B
#> 56  38 female Grp B
#> 57  39 female Grp B
#> 58  47 female Grp B
#> 59  41 female Grp C
#> 60  44 female Grp C
#> 62  48 female Grp C
#> 63  47 female Grp C
#> 64  42 female Grp C
#> 65  42 female Grp C
#> 66  49 female Grp C
#> 68  38 female Grp C
#> 69  38 female Grp C
#> 70  48 female Grp C
#> 72  45 female Grp C
#> 73  45 female Grp C
#> 74  36 female Grp C
#> 75  45 female Grp C
#> 76  52 female Grp C
#> 78  43 female Grp C
#> 79  47 female Grp C
#> 80  44 female Grp C

*I covered for builtin functions. Keep in mind that you may also use dplyr() or tidyverse styles. This can be easily asked from ChatGPT/Gemini.

1.3 Modifying data

1.3.1 Creating new variables

Create age in month,

data$age_month = data$age * 12
data$age_month
#>  [1] 456 420 468 432 372 456 396 432 480 408 456 480 480 336 444 456 588 348 480
#> [20] 456 408 552 504 456 384 516 504 480 456 468 468 468 420 456 480 456 540 432
#> [39] 372 408 528 420 480 444 396 552 504 480 540 504 540 456 408 528 468 456 468
#> [58] 564 492 528 360 576 564 504 504 588 372 456 456 576 408 540 540 432 540 624
#> [77] 420 516 564 528

1.3.2 Recoding variables

Histogram of age,

hist(data$age)
abline(v = 35, col = "red")
abline(v = 45, col = "red")

We decide to cut at 35 and 45 years old,

data$age_cat = cut(data$age, breaks = c(-Inf, 35, 45, Inf),
                   labels = c("<35", "35-45", ">45"))
levels(data$age_cat)  # view the newly created categories/levels
#> [1] "<35"   "35-45" ">45"
table(data$age_cat)  # view counts per categories
#> 
#>   <35 35-45   >45 
#>    18    52    10

We need car package to use recode() function,

library(car)
#> Loading required package: carData
data$age_cat1 = recode(data$age_cat, 
                       "c('35-45', '>45') = '35 & above'")
levels(data$age_cat1)
#> [1] "<35"        "35 & above"
table(data$age_cat1)
#> 
#>        <35 35 & above 
#>         18         62

Pay attention to the use of " " and ' ' in recode().

2 Descriptive Statistics

We again use cholest.sav dataset. Now we give it a proper object name cholest,

library(foreign)
cholest = read.spss("cholest.sav", to.data.frame = T)
str(cholest)
#> 'data.frame':    80 obs. of  5 variables:
#>  $ chol    : num  6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
#>  $ age     : num  38 35 39 36 31 38 33 36 40 34 ...
#>  $ exercise: num  6 5 6 5 4 4 5 5 4 6 ...
#>  $ sex     : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ categ   : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
#>  - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
#>   ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
#>  - attr(*, "codepage")= int 65001

In general, simple descriptive statistics can be obtained using summary() function,

summary(cholest)
#>       chol            age           exercise         sex       categ   
#>  Min.   : 6.50   Min.   :28.00   Min.   :2.000   female:40   Grp A:25  
#>  1st Qu.: 7.60   1st Qu.:36.00   1st Qu.:4.000   male  :40   Grp B:33  
#>  Median : 8.30   Median :39.00   Median :4.000               Grp C:22  
#>  Mean   : 8.23   Mean   :39.48   Mean   :4.225                         
#>  3rd Qu.: 8.80   3rd Qu.:43.25   3rd Qu.:5.000                         
#>  Max.   :10.00   Max.   :52.00   Max.   :6.000

The results depend on the variable type. ## Central tendency and dispersion For numerical variables, we can obtain the measures of central tendency (mean and median) and dispersion (standard deviation, SD and interquartile range, IQR). Now we obtain in pairs of mean (SD) and median (IQR), Mean,

mean(cholest$chol)
#> [1] 8.23
mean(cholest$age)
#> [1] 39.475

Standard deviation, SD,

sd(cholest$chol)
#> [1] 0.8386849
sd(cholest$age)
#> [1] 5.128661

Median,

median(cholest$chol)
#> [1] 8.3
median(cholest$age)
#> [1] 39

and interquartile range, IQR,

IQR(cholest$chol)
#> [1] 1.2
IQR(cholest$age)
#> [1] 7.25

*Try exploring psych package for descriptive statistics. ## Proportions For categorical variables, we want to obtain the count per group, proportions and percentages.

The count per group using table() function (we can also obtain the counts from summary() function as done before),

tab_sex = table(cholest$sex)
tab_categ = table(cholest$categ)
tab_sex
#> 
#> female   male 
#>     40     40
tab_categ
#> 
#> Grp A Grp B Grp C 
#>    25    33    22

The proportions,

prop.table(tab_sex)
#> 
#> female   male 
#>    0.5    0.5
prop.table(tab_categ)
#> 
#>  Grp A  Grp B  Grp C 
#> 0.3125 0.4125 0.2750

and to obtain the percentages, we multiply the proportions by 100,

prop.table(tab_sex)*100
#> 
#> female   male 
#>     50     50
prop.table(tab_categ)*100
#> 
#> Grp A Grp B Grp C 
#> 31.25 41.25 27.50

2.1 Statistics by groups

For numerical variables, we can obtain the statistics by groups (the categorical variables) using by() function. The syntax is by(numerical_variable, categorical_variable, function).

Mean and SD for chol by sex,

by(cholest$chol, cholest$sex, mean)
#> cholest$sex: female
#> [1] 8.9275
#> ------------------------------------------------------------ 
#> cholest$sex: male
#> [1] 7.5325
by(cholest$chol, cholest$sex, sd)
#> cholest$sex: female
#> [1] 0.4551627
#> ------------------------------------------------------------ 
#> cholest$sex: male
#> [1] 0.4687066

2.2 Cross-tabulation

For categorical variables, it is important to be able to perform cross-tabulation to explore the count per cells for each combination of groups. Again, we use table() function.

For sex and categ, we obtain the basic cross-tabulation,

tab_sex_categ = table(Gender = cholest$sex, Category = cholest$categ)
tab_sex_categ
#>         Category
#> Gender   Grp A Grp B Grp C
#>   female     0    18    22
#>   male      25    15     0

Notice we can give headers (“Gender” and “Category”) to groups in the table as shown above.

We can also easily obtain the proportions and percentages,

prop_sex_categ = prop.table(tab_sex_categ)
prop_sex_categ
#>         Category
#> Gender    Grp A  Grp B  Grp C
#>   female 0.0000 0.2250 0.2750
#>   male   0.3125 0.1875 0.0000
per_sex_categ = prop.table(tab_sex_categ)*100
per_sex_categ
#>         Category
#> Gender   Grp A Grp B Grp C
#>   female  0.00 22.50 27.50
#>   male   31.25 18.75  0.00

and add the marginal counts,

margin_sex_categ = addmargins(tab_sex_categ)
margin_sex_categ
#>         Category
#> Gender   Grp A Grp B Grp C Sum
#>   female     0    18    22  40
#>   male      25    15     0  40
#>   Sum       25    33    22  80

and view the proportions and percentages again, including that of the marginal counts,

addmargins(prop_sex_categ)
#>         Category
#> Gender    Grp A  Grp B  Grp C    Sum
#>   female 0.0000 0.2250 0.2750 0.5000
#>   male   0.3125 0.1875 0.0000 0.5000
#>   Sum    0.3125 0.4125 0.2750 1.0000
addmargins(per_sex_categ)
#>         Category
#> Gender    Grp A  Grp B  Grp C    Sum
#>   female   0.00  22.50  27.50  50.00
#>   male    31.25  18.75   0.00  50.00
#>   Sum     31.25  41.25  27.50 100.00

3 Graphical Exploration of Data

3.1 Plotting using lattice package

We are going to use lattice package for plotting the plots. It creates nice plots in R and easy to explore multivariate data (i.e. many variables) and comparing plots by groups.

Load lattice package,

library(lattice)

3.2 Histogram

We again use cholest.sav dataset. We give it cholest object name,

library(foreign)
cholest = read.spss("cholest.sav", to.data.frame = TRUE)

and view the basic information of the dataset,

str(cholest)
#> 'data.frame':    80 obs. of  5 variables:
#>  $ chol    : num  6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
#>  $ age     : num  38 35 39 36 31 38 33 36 40 34 ...
#>  $ exercise: num  6 5 6 5 4 4 5 5 4 6 ...
#>  $ sex     : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ categ   : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
#>  - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
#>   ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
#>  - attr(*, "codepage")= int 65001
head(cholest)
#>   chol age exercise  sex categ
#> 1  6.5  38        6 male Grp A
#> 2  6.6  35        5 male Grp A
#> 3  6.8  39        6 male Grp A
#> 4  6.8  36        5 male Grp A
#> 5  6.9  31        4 male Grp A
#> 6  7.0  38        4 male Grp A

3.2.1 Histogram

Plot the histogram for chol,

histogram(~ chol, data = cholest)

Then plot the histogram for chol by sex and categ groups,

histogram(~ chol | sex, data = cholest)

histogram(~ chol | categ, data = cholest)

To make the comparison easier, we can adjust the layout,

histogram(~ chol | sex, data = cholest, layout = c(1, 2))

histogram(~ chol | categ, data = cholest, layout = c(1, 3))

e.g. layout = c(1, 2) means 1 column (over the x-axis) and 2 rows (along the y-axis).

3.3 Box-and-whisker plot

Plot the box-and-whisker plot for chol,

bwplot(~ chol, data = cholest)

Then plot the box-and-whisker plot for chol by sex and categ groups,

bwplot(sex ~ chol, data = cholest)

bwplot(categ ~ chol, data = cholest)

Notice the grouping is specified before ~ sign, which set sex and categ on y-axis. The grouping can also be specified on x-axis for nicer-looking plots,

bwplot(chol ~ sex, data = cholest)

bwplot(chol ~ categ, data = cholest)

3.4 Bar chart

For categorical variables, we can plot bar charts. But, first we need the count per group for the categorical variables,

table_sex = table(cholest$sex)
table_sex
#> 
#> female   male 
#>     40     40
table_categ = table(cholest$categ)
table_categ
#> 
#> Grp A Grp B Grp C 
#>    25    33    22

Then using the counts, we can plot the charts,

barchart(table_sex)

barchart(table_categ)

For a more flexible chart setting, convert the table to a data frame, let say for table_categ

df_categ = as.data.frame(table_categ)
colnames(df_categ) = c("Category", "Count")  # set the column names
df_categ
#>   Category Count
#> 1    Grp A    25
#> 2    Grp B    33
#> 3    Grp C    22

Then, we can plot with Category as x-axis,

barchart(Count ~ Category, data = df_categ)

We can also plot cross-tabulated groups (e.g. sex vs categ). First, properly prepare the data frame,

table_sex_categ = table(cholest$sex, cholest$categ)  # obtain the count
df_sex_categ = as.data.frame(table_sex_categ)  # save as data frame
colnames(df_sex_categ) = c("Sex", "Category", "Count")  # give proper names

Then, plot the barcharts,

barchart(Count ~ Sex | Category, df_sex_categ, layout = c(3, 1))

barchart(Count ~ Category | Sex, df_sex_categ, layout = c(2, 1))

3.5 Scatter plot

Plot a scatter plot to explore the relationship between cholestrol and age (y-axis chol vs x-axis age),

xyplot(chol ~ age, data = cholest)

We can also explore the relationship by groups,

xyplot(chol ~ age | sex, data = cholest)

xyplot(chol ~ age | categ, data = cholest, layout = c(3,1))

There seem to be some patterns of cholestrol levels by groups.