Data Frame Creation (Climate Change)
Library
Packages needed to run this code. Also initializing a lag function.
library(janitor) # Cleaning Data
library(tidyverse) # Data Manipulation
library(plm) # Panel Linear Models
library(stargazer) # Table Creation
library(readxl) # Read Excel Files
library(vdemdata) # V-dem Dataset
# Lag function
diff <- function(x) {x - dplyr::lag(x)}
V-Dem Data
Here we bring in the V-Dem data set which allows us to collect the yearly corruption values from 1999 to 2020. Other control variables are included.
vdem_df <-vdem
vdem_df_filtered <- vdem_df %>% filter(year %in% 1999:2020) %>%
select(country_name, year, country_text_id, v2x_corr, v2xnp_regcorr, e_gdppc,
e_pop) %>% rename(country= country_name, iso= country_text_id,
corr = v2x_corr, reg_corr = v2xnp_regcorr,
pop = e_pop, gdppc = e_gdppc) %>%
mutate(corr_rate = diff(corr),reg_rate = diff(reg_corr))
df <- vdem_df_filtered %>% filter(year %in% 2000:2020)
colSums(is.na(df)) # Identifying Missing Data
## country year iso corr reg_corr gdppc pop corr_rate
## 0 0 0 6 1 272 272 8
## reg_rate
## 2
Disaster Data
The disaster data is was taken from EM-DAT database. Here I filter by biological disaster. I also create three subgroups and create dummy columns by disaster type. This uses the fastDummies package.
dis_df <- read_excel("dataset2.xlsx", skip = 6) %>% clean_names()
dis_df1 <- dis_df %>% select(start_year, country,
region,iso, disaster_type, iso, disaster_subtype,
disaster_subgroup, contains("total")) %>%
rename(year = start_year) # Getting the Rows Needed
# Filtering by Biological Disaster
dis_df2 <- dis_df1 %>% filter(disaster_subgroup != "Biological") %>%
filter(disaster_type %in% c("Flood" , "Drought" , "Earthquake"))
unique(dis_df2$disaster_subgroup) # Three Subgroups
## [1] "Hydrological" "Climatological" "Geophysical"
dis_df3 <- fastDummies::dummy_cols(dis_df2, select_columns = "disaster_type") # Creating Dummy Columns
Amount of occurrences per disaster.
occurrences <- dis_df1 %>%
count(disaster_subgroup)
print(occurrences)
## # A tibble: 4 × 2
## disaster_subgroup n
## <chr> <int>
## 1 Climatological 599
## 2 Geophysical 687
## 3 Hydrological 3847
## 4 Meteorological 2610
Here I am adding in zeros, when there is no disaster for that subgroup.
dis_df4 <- dis_df3 %>%
mutate_if(is.numeric, ~replace_na(., 0))
Aggregating several variables, in particular the amount of damages and those affected.
names <- c("disaster_type_Drought", "disaster_type_Earthquake","disaster_type_Flood")
dataframe <- dis_df4 %>% group_by(country, year, iso) %>% summarise(death = sum(total_deaths),
affected = sum(total_affected),
damages= sum(total_damages_000_us),
occurences = n(),
n_floods = sum(disaster_type_Flood),
n_drought= sum(disaster_type_Drought),
n_earth = sum(disaster_type_Earthquake))
Merging
Merging the V-Dem data with the EM-DAT data. The final line collects how many missing values.
data_frame <- merge(df, dataframe, by=c("year", "iso"), all = TRUE) %>% rename(country = country.x) %>%
select(-country.y) # Combine V-Dem with Disaster Data
keep_row <-unique(dis_df$iso)
df2 <- data_frame %>% filter(iso %in% keep_row)
df3 <- df2 %>% filter(!is.na(country))
df4 <- mutate_if(df3[,10:16], is.numeric, ~replace_na(., 0))
final_df <- cbind(df3[,1:9],df4)
colSums(is.na(final_df))
## year iso country corr reg_corr gdppc pop
## 0 0 0 1 1 210 210
## corr_rate reg_rate death affected damages occurences n_floods
## 2 2 0 0 0 0 0
## n_drought n_earth
## 0 0
OECD
Now it is time to bring in the OECD data so we can understand what parts of the world were most affected.
filename <- "https://raw.githubusercontent.com/openclimatedata/countrygroups/main/data/oecd.csv"
mydata <-read_csv(url(filename))
names_oecd <- mydata$Name
final_df$oecd <- +(final_df$country %in% names_oecd)
final_df <- final_df %>% mutate(l_dam = log(1+damages), l_aff = log(1+affected),
l_death = log(1+death))
Adding Dummy Variables
Finally we create dummy variables for each variable type in the final data frame. The summary output only shows the first 10 columns to save space.
final_df$dis <- if_else(final_df$occurences > 0, 1, 0)
final_df$flood <- if_else(final_df$n_floods > 0, 1, 0)
final_df$drought <- if_else(final_df$n_drought > 0, 1, 0)
final_df$earth <- if_else(final_df$n_earth > 0, 1, 0)
selected_df <- final_df[, 1:10]
summary(selected_df)
## year iso country corr
## Min. :2000 Length:3559 Length:3559 Min. :0.0020
## 1st Qu.:2005 Class :character Class :character 1st Qu.:0.2060
## Median :2010 Mode :character Mode :character Median :0.5710
## Mean :2010 Mean :0.5118
## 3rd Qu.:2015 3rd Qu.:0.7887
## Max. :2020 Max. :0.9660
## NA's :1
## reg_corr gdppc pop corr_rate
## Min. :0.0020 Min. : 0.538 Min. : 8.76 Min. :-0.670000
## 1st Qu.:0.1893 1st Qu.: 3.069 1st Qu.: 378.19 1st Qu.: 0.000000
## Median :0.5640 Median : 8.874 Median : 1057.76 Median : 0.000000
## Mean :0.5055 Mean : 15.016 Mean : 4333.55 Mean :-0.001615
## 3rd Qu.:0.7750 3rd Qu.: 21.251 3rd Qu.: 2978.95 3rd Qu.: 0.000000
## Max. :0.9680 Max. :100.865 Max. :148256.42 Max. : 0.451000
## NA's :1 NA's :210 NA's :210 NA's :2
## reg_rate death
## Min. :-0.661000 Min. : 0.0
## 1st Qu.: 0.000000 1st Qu.: 0.0
## Median : 0.000000 Median : 0.0
## Mean :-0.002098 Mean : 239.7
## 3rd Qu.: 0.000000 3rd Qu.: 9.0
## Max. : 0.515000 Max. :222614.0
## NA's :2