Importing the data

Cleaning the data

library(readxl) # use package readxl
HDR25_Statistical_Annex_HDI_Table <- read_excel("HDR25_Statistical_Annex_HDI_Table.xlsx") # read and name file
View(HDR25_Statistical_Annex_HDI_Table)
HDR25_Statistical_Annex_HDI_Table # view file
str(HDR25_Statistical_Annex_HDI_Table) # view data types for each column
## tibble [278 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1                                               : chr [1:278] NA NA NA NA ...
##  $ Table 1. Human Development Index and its components: chr [1:278] NA NA NA NA ...
##  $ ...3                                               : chr [1:278] NA NA NA "Human Development Index (HDI)" ...
##  $ ...4                                               : logi [1:278] NA NA NA NA NA NA ...
##  $ ...5                                               : chr [1:278] NA "SDG3" NA "Life expectancy at birth" ...
##  $ ...6                                               : chr [1:278] NA NA NA NA ...
##  $ ...7                                               : chr [1:278] NA "SDG4.3" NA "Expected years of schooling" ...
##  $ ...8                                               : chr [1:278] NA NA NA NA ...
##  $ ...9                                               : chr [1:278] NA "SDG4.4" NA "Mean years of schooling" ...
##  $ ...10                                              : chr [1:278] NA NA NA NA ...
##  $ ...11                                              : chr [1:278] NA "SDG8.5" NA "Gross national income (GNI) per capita" ...
##  $ ...12                                              : chr [1:278] NA NA NA NA ...
##  $ ...13                                              : chr [1:278] NA NA NA "GNI per capita rank minus HDI rank" ...
##  $ ...14                                              : chr [1:278] NA NA NA NA ...
##  $ ...15                                              : chr [1:278] NA NA NA "HDI rank" ...
head(HDR25_Statistical_Annex_HDI_Table) # view first 6 rows of data
tail(HDR25_Statistical_Annex_HDI_Table) # view last 6 rows of data
as.list(HDR25_Statistical_Annex_HDI_Table[4,]) # get a list of the fourth row which is where the real column names are
## $...1
## [1] NA
## 
## $`Table 1. Human Development Index and its components`
## [1] NA
## 
## $...3
## [1] "Human Development Index (HDI)"
## 
## $...4
## [1] NA
## 
## $...5
## [1] "Life expectancy at birth"
## 
## $...6
## [1] NA
## 
## $...7
## [1] "Expected years of schooling"
## 
## $...8
## [1] NA
## 
## $...9
## [1] "Mean years of schooling"
## 
## $...10
## [1] NA
## 
## $...11
## [1] "Gross national income (GNI) per capita"
## 
## $...12
## [1] NA
## 
## $...13
## [1] "GNI per capita rank minus HDI rank"
## 
## $...14
## [1] NA
## 
## $...15
## [1] "HDI rank"
colnames(HDR25_Statistical_Annex_HDI_Table) <- as.list(HDR25_Statistical_Annex_HDI_Table[4,]) # use the fourth row to rename the columns

names(HDR25_Statistical_Annex_HDI_Table)[2]=c("Country") # change the name of the second column to country since it was in a different row than the others
## Warning: The `value` argument of `names<-()` can't be empty as of tibble 3.0.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
HDR25_Statistical_Annex_HDI_Table <- HDR25_Statistical_Annex_HDI_Table[-c(1:7),] # remove the first 7 rows

HDR25_Statistical_Annex_HDI_Table <-
HDR25_Statistical_Annex_HDI_Table[,-c(1,4,6,8,10,12,14,15)] # delete these columns that have nothing in them
HDR25_Statistical_Annex_HDI_Table # view data
HDR25_Statistical_Annex_HDI_Table <- HDR25_Statistical_Annex_HDI_Table[-c(75,126,170,197:271),] # delete the rows at the bottom that are not needed
colnames(HDR25_Statistical_Annex_HDI_Table) # view column names
## [1] "Country"                               
## [2] "Human Development Index (HDI)"         
## [3] "Life expectancy at birth"              
## [4] "Expected years of schooling"           
## [5] "Mean years of schooling"               
## [6] "Gross national income (GNI) per capita"
## [7] "GNI per capita rank minus HDI rank"
names(HDR25_Statistical_Annex_HDI_Table)[2]=c("Human_Dev_Index") # change column name to remove spaces
names(HDR25_Statistical_Annex_HDI_Table)[3]=c("Life_Exp") # change column name to remove spaces
names(HDR25_Statistical_Annex_HDI_Table)[4]=c("School_Exp") # change column name to remove spaces
names(HDR25_Statistical_Annex_HDI_Table)[5]=c("Mean_Yrs_School") # change column name to remove spaces
names(HDR25_Statistical_Annex_HDI_Table)[6]=c("Gross_Nat_Income_per_cap") # change column name to remove spaces
names(HDR25_Statistical_Annex_HDI_Table)[7]=c("Rank_GNI_minus_HDI") # change column name to remove spaces

colnames(HDR25_Statistical_Annex_HDI_Table) # view column names
## [1] "Country"                  "Human_Dev_Index"         
## [3] "Life_Exp"                 "School_Exp"              
## [5] "Mean_Yrs_School"          "Gross_Nat_Income_per_cap"
## [7] "Rank_GNI_minus_HDI"

Formatting the data

HDR25_Statistical_Annex_HDI_Table$Human_Dev_Index=as.numeric(HDR25_Statistical_Annex_HDI_Table$Human_Dev_Index) # change column to numeric data type
HDR25_Statistical_Annex_HDI_Table$Life_Exp=as.numeric(HDR25_Statistical_Annex_HDI_Table$Life_Exp) # change column to numeric data type
HDR25_Statistical_Annex_HDI_Table$School_Exp=as.numeric(HDR25_Statistical_Annex_HDI_Table$School_Exp) # change column to numeric data type
HDR25_Statistical_Annex_HDI_Table$Mean_Yrs_School=as.numeric(HDR25_Statistical_Annex_HDI_Table$Mean_Yrs_School) # change column to numeric data type
HDR25_Statistical_Annex_HDI_Table$Gross_Nat_Income_per_cap=as.numeric(HDR25_Statistical_Annex_HDI_Table$Gross_Nat_Income_per_cap) # change column to numeric data type
HDR25_Statistical_Annex_HDI_Table$Rank_GNI_minus_HDI=as.numeric(HDR25_Statistical_Annex_HDI_Table$Rank_GNI_minus_HDI) # change column to numeric data type

str(HDR25_Statistical_Annex_HDI_Table) # view data types for each column
## tibble [193 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Country                 : chr [1:193] "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ Human_Dev_Index         : num [1:193] 0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
##  $ Life_Exp                : num [1:193] 82.7 83.3 84 81.9 81.4 ...
##  $ School_Exp              : num [1:193] 18.9 18.8 16.7 18.7 17.3 ...
##  $ Mean_Yrs_School         : num [1:193] 13.9 13.1 13.9 13 14.3 ...
##  $ Gross_Nat_Income_per_cap: num [1:193] 69117 112710 81949 76008 64053 ...
##  $ Rank_GNI_minus_HDI      : num [1:193] 12 0 5 4 13 10 14 4 6 9 ...
summary(HDR25_Statistical_Annex_HDI_Table) # view statistical summary
##    Country          Human_Dev_Index     Life_Exp       School_Exp    
##  Length:193         Min.   :0.3880   Min.   :54.46   Min.   : 5.635  
##  Class :character   1st Qu.:0.6220   1st Qu.:67.39   1st Qu.:11.505  
##  Mode  :character   Median :0.7620   Median :73.49   Median :13.336  
##                     Mean   :0.7408   Mean   :73.11   Mean   :13.585  
##                     3rd Qu.:0.8620   3rd Qu.:78.34   3rd Qu.:15.888  
##                     Max.   :0.9720   Max.   :85.71   Max.   :20.846  
##  Mean_Yrs_School  Gross_Nat_Income_per_cap Rank_GNI_minus_HDI
##  Min.   : 1.412   Min.   :   688.3         Min.   :-54.0000  
##  1st Qu.: 6.780   1st Qu.:  5746.6         1st Qu.: -7.0000  
##  Median : 9.933   Median : 15866.5         Median :  1.0000  
##  Mean   : 9.173   Mean   : 24620.7         Mean   :  0.2021  
##  3rd Qu.:11.642   3rd Qu.: 36793.0         3rd Qu.: 10.0000  
##  Max.   :14.296   Max.   :166811.7         Max.   : 38.0000

Analyzing the data

library(dplyr) # use package dplyr
HDR25_Statistical_Annex_HDI_Table |>
  group_by() |>
  summarize(Life_Exp_Mean = mean(Life_Exp),
            School_Exp_Mean = mean(School_Exp),
            Mean_Yrs_School_Mean = mean(Mean_Yrs_School),
            Gross_Nat_Income_per_cap_Mean = mean(Gross_Nat_Income_per_cap)) # Calculate and show the mean of each column listed

Save and Export

saveRDS(HDR25_Statistical_Annex_HDI_Table,"HDR_Cleaned_Formatted.RDS") # code is saving the file as an RDS file
HDR_Cleaned_FormattedRDS=readRDS("HDR_Cleaned_Formatted.RDS") # code is reading the RDS file
str(HDR_Cleaned_FormattedRDS) # code is checking the data types of the RDS file
## tibble [193 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Country                 : chr [1:193] "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ Human_Dev_Index         : num [1:193] 0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
##  $ Life_Exp                : num [1:193] 82.7 83.3 84 81.9 81.4 ...
##  $ School_Exp              : num [1:193] 18.9 18.8 16.7 18.7 17.3 ...
##  $ Mean_Yrs_School         : num [1:193] 13.9 13.1 13.9 13 14.3 ...
##  $ Gross_Nat_Income_per_cap: num [1:193] 69117 112710 81949 76008 64053 ...
##  $ Rank_GNI_minus_HDI      : num [1:193] 12 0 5 4 13 10 14 4 6 9 ...
write.csv(HDR25_Statistical_Annex_HDI_Table,"HDR_Cleaned_Formatted.csv", row.names = FALSE) # code is saving the data frame as a csv file
HDR_Cleaned_FormattedCSV=read.csv("HDR_Cleaned_Formatted.csv") # code is reading the csv file
str(HDR_Cleaned_FormattedCSV) # code is checking the data types of the csv file
## 'data.frame':    193 obs. of  7 variables:
##  $ Country                 : chr  "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ Human_Dev_Index         : num  0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
##  $ Life_Exp                : num  82.7 83.3 84 81.9 81.4 ...
##  $ School_Exp              : num  18.9 18.8 16.7 18.7 17.3 ...
##  $ Mean_Yrs_School         : num  13.9 13.1 13.9 13 14.3 ...
##  $ Gross_Nat_Income_per_cap: num  69117 112710 81949 76008 64053 ...
##  $ Rank_GNI_minus_HDI      : int  12 0 5 4 13 10 14 4 6 9 ...