1 Read data

We have these files: cholest.csv, cholest.sav, cholest.dta, and cholest.xlsx.

Note: Always make sure that you set the working directory first!

# Import the CSV file with base R's read.csv(), the most natural way to open
# data in R
data.csv <- read.csv(file = "cholest.csv")
data.csv  # print the imported data

library(foreign)  # provides readers for .sav (SPSS) and .dta (STATA) files
# SPSS file; to.data.frame = TRUE asks for the result as a data frame
data.sav <- read.spss(file = "cholest.sav", to.data.frame = TRUE)
data.sav  # print the imported data

# STATA file
data.dta <- read.dta(file = "cholest.dta")
data.dta  # print the imported data

library(readxl)  # reads Excel files; the package must be installed first
# Import the first worksheet of the workbook
data.xls <- read_excel(path = "cholest.xlsx", sheet = 1)
data.xls  # print the imported data

2 Handle data

2.1 Basics

str(data.csv)  # Basic info: object class, dimensions, and the type of each variable

## 'data.frame':    80 obs. of  5 variables:
##  $ chol    : num  6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
##  $ age     : int  38 35 39 36 31 38 33 36 40 34 ...
##  $ exercise: int  6 5 6 5 4 4 5 5 4 6 ...
##  $ sex     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ categ   : int  0 0 0 0 0 0 0 0 0 0 ...

dim(data.csv)  # Dimensions: number of rows (cases), then number of columns (variables)

## [1] 80  5

names(data.csv)  # Variable (column) names

## [1] "chol"     "age"      "exercise" "sex"      "categ"

2.2 View data

head(data.csv, n = 6)  # first six rows (n = 6 is the default)

tail(data.csv, n = 6)  # last six rows (n = 6 is the default)

data.csv  # print all rows in the console

View(data.csv)  # open the data in a new tab instead of the console

2.3 Select specific parts of data (subsetting)

data.csv$age  # Extract the "age" variable only, as a vector

##  [1] 38 35 39 36 31 38 33 36 40 34 38 40 40 28 37 38 49 29 40 38 34 46 42 38 32 43 42 40
## [29] 38 39 39 39 35 38 40 38 45 36 31 34 44 35 40 37 33 46 42 40 45 42 45 38 34 44 39 38
## [57] 39 47 41 44 30 48 47 42 42 49 31 38 38 48 34 45 45 36 45 52 35 43 47 44

data.csv["age"]  # Select "age" by name; single brackets keep it as a one-column data frame

data.csv[2]  # Select the second column by position, which is also "age"

In general, the syntax is data[row, col], where row and col can each be given as numbers or names.

# Rows 1 to 10; columns 2 to 4
data.csv[1:10, 2:4]

# Rows 1, 3, 5, 7, 9; columns age and chol
data.csv[c(1, 3, 5, 7, 9), c("age", "chol")]

# To filter rows by a condition, pass a logical vector (one TRUE/FALSE per
# row) as the row index. data.csv$age is a plain vector, so the comparison
# gives a logical vector; data.csv["age"] is a one-column data frame, and
# comparing it gives a logical matrix instead.

# Rows where age is 38; columns age and chol
data.csv[data.csv$age == 38, c("age", "chol")]

# Rows where sex is 1; columns sex and chol
data.csv[data.csv$sex == 1, c("sex", "chol")]

You can also use subset(); the syntax is subset(data, condition, variables).

# Rows where age is 38, with all columns
subset(data.csv, subset = age == 38)

# Same rows, keeping only the columns from age through sex
subset(data.csv, subset = age == 38, select = age:sex)

3 Basic analysis

We use data.sav here because its categorical variables come with category labels.

str(data.sav)  # Numerical variables are shown as num, categorical variables as Factor

## 'data.frame':    80 obs. of  5 variables:
##  $ chol    : num  6.5 6.6 6.8 6.8 6.9 7 7 7.2 7.2 7.2 ...
##  $ age     : num  38 35 39 36 31 38 33 36 40 34 ...
##  $ exercise: num  6 5 6 5 4 4 5 5 4 6 ...
##  $ sex     : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ categ   : Factor w/ 3 levels "Grp A","Grp B",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "variable.labels")= Named chr [1:5] "cholesterol in mmol/L" "age in year" "duration of exercise (hours/week)" "" ...
##   ..- attr(*, "names")= chr [1:5] "chol" "age" "exercise" "sex" ...
##  - attr(*, "codepage")= int 65001

summary(data.sav)  # Min, quartiles, median, mean and max for numerical variables; counts per level for factors

##       chol            age           exercise         sex       categ   
##  Min.   : 6.50   Min.   :28.00   Min.   :2.000   female:40   Grp A:25  
##  1st Qu.: 7.60   1st Qu.:36.00   1st Qu.:4.000   male  :40   Grp B:33  
##  Median : 8.30   Median :39.00   Median :4.000               Grp C:22  
##  Mean   : 8.23   Mean   :39.48   Mean   :4.225                         
##  3rd Qu.: 8.80   3rd Qu.:43.25   3rd Qu.:5.000                         
##  Max.   :10.00   Max.   :52.00   Max.   :6.000

3.1 Numerical

library(psych)  # provides describe()
# Describe the three numerical variables only
describe(
  data.sav[c("chol", "age", "exercise")]
)

3.2 Categorical

table(data.sav$sex)  # Frequency count for each level of sex

## 
## female   male 
##     40     40

table(data.sav$categ)  # Frequency count for each level of categ

## 
## Grp A Grp B Grp C 
##    25    33    22

4 Plots

4.1 Histogram

hist(data.sav$chol)  # Histogram of chol (cholesterol in mmol/L)

4.2 Boxplot

boxplot(data.sav$chol)  # Boxplot of chol (cholesterol in mmol/L)

4.3 Scatter plot

# Scatter plot with age on the x-axis and chol on the y-axis
plot(data.sav$age, data.sav$chol)
# Add the fitted line from the linear model of chol on age. abline() draws on
# the plot that is already open, so two lines of code are needed and plot()
# must run first.
abline(lm(chol ~ age, data = data.sav))  #need two lines of codes

4.4 Bar chart

# Tabulate sex, then draw one bar per level; the colours are applied in level
# order (female, male)
count <- table(data.sav$sex)
barplot(height = count, col = c("blue", "red"))

5 More R in …

Exploring Data Using R

Book: https://penerbit.usm.my/index.php/buku/425-exploring-data-using-r

References

R Core Team. (2022). Foreign: Read data stored by minitab, s, SAS, SPSS, stata, systat, weka, dBase, ... Retrieved from https://svn.r-project.org/R-packages/trunk/foreign/

Revelle, W. (2023). Psych: Procedures for psychological, psychometric, and personality research. Retrieved from https://personality-project.org/r/psych/ https://personality-project.org/r/psych-manual.pdf

Wickham, H., & Bryan, J. (2023). Readxl: Read excel files. Retrieved from https://CRAN.R-project.org/package=readxl

Introduction to R – Basics

Note updated May 09, 2023. Not for sale :-)

Wan Nor Arifin (wnarifin@usm.my),
Biostatistics and Research Methodology Unit, Universiti Sains Malaysia
Website: wnarifin.github.io

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.

1 Read data

2 Handle data

2.1 Basics

2.2 View data

2.3 Select specific parts of data (subsetting)

3 Basic analysis

3.1 Numerical

3.2 Categorical

4 Plots

4.1 Histogram

4.2 Boxplot

4.3 Scatter plot

4.4 Bar chart

5 More R in …

References

Introduction to R – Basics

Note updated May 09, 2023. Not for sale :-)

Wan Nor Arifin (wnarifin@usm.my), Biostatistics and Research Methodology Unit, Universiti Sains Malaysia. Website: wnarifin.github.io

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.

1 Read data

2 Handle data

2.1 Basics

2.2 View data

2.3 Select specific parts of data (subsetting)

3 Basic analysis

3.1 Numerical

3.2 Categorical

4 Plots

4.1 Histogram

4.2 Boxplot

4.3 Scatter plot

4.4 Bar chart

5 More R in …

References

Wan Nor Arifin (wnarifin@usm.my),
Biostatistics and Research Methodology Unit, Universiti Sains Malaysia
Website: wnarifin.github.io