# Import data ----
# Each call downloads one CSV file from the course site into a data frame.
Countries <- read.csv(
  file = "https://tkelleman.github.io/tkweb/Week5/countries_total.csv"
)
Income <- read.csv(
  file = "https://tkelleman.github.io/tkweb/Week5/income_per_person.csv"
)
LifeExp <- read.csv(
  file = "https://tkelleman.github.io/tkweb/Week5/life_expectancy_years.csv"
)
Population <- read.csv(
  file = "https://tkelleman.github.io/tkweb/Week5/population_total.csv"
)

## Data preparation: reshape the wide tables to long format and merge them.

# Reshape one wide table (a `geo` column plus one column per year) into long
# format with the columns Country, Year, and `value_name`.
#   wide_data  - data frame with a `geo` column; every other column is treated
#                as one year's measurements (names such as "X1800")
#   value_name - name to give the measurement column in the result
# Returns a plain data frame. Rows whose measurement is NA are dropped, and
# Year is stored as an integer.
reshape_to_long <- function(wide_data, value_name) {
  long_data <- tidyr::pivot_longer(
    wide_data,
    cols = -geo,
    names_to = "Year",
    values_to = value_name,
    values_drop_na = TRUE
  )
  # Keep a base data frame so the merge() calls below behave as before.
  long_data <- as.data.frame(long_data)
  names(long_data)[names(long_data) == "geo"] <- "Country"
  # Strip only a leading "X" (not every "X") before converting to a number.
  long_data$Year <- as.integer(sub("^X", "", long_data$Year))
  long_data
}

## A - Income per person in long format: Country, Year, Income.
IncomeEdit <- reshape_to_long(Income, "Income")

## B - Life expectancy in long format: Country, Year, LifeExpectancy.
LifeExpEdit <- reshape_to_long(LifeExp, "LifeExpectancy")

## C - Join the two long tables on country and year. Only country/year pairs
##     present in both tables are kept.
LifeExpIncom <- merge(IncomeEdit, LifeExpEdit, by = c("Country", "Year"))

## D - Add the country/region information. all.x = TRUE keeps every row of
##     LifeExpIncom; countries with no matching `name` in Countries get NA
##     in the region columns.
LifeIncomCount <- merge(
  LifeExpIncom,
  Countries,
  by.x = "Country",
  by.y = "name",
  all.x = TRUE
)

## E - Add population size, so the final data set has income, life
##     expectancy, population size, and country region.
PopulationEdit <- reshape_to_long(Population, "Population")

FinalData <- merge(LifeIncomCount, PopulationEdit, by = c("Country", "Year"))

# Drop the country-code columns that are not needed for the analysis.
drop_cols <- c(
  "alpha.2", "alpha.3", "country.code", "iso_3166.2", "sub.region",
  "region.code", "sub.region.code", "intermediate.region.code",
  "intermediate.region"
)
FinalData <- FinalData[, !(names(FinalData) %in% drop_cols), drop = FALSE]

#write.csv(FinalData, "FinalData.csv")

## 3 - Subset of the longitudinal data containing only the year 2000. It is
##     named Y2000data because an R object name cannot start with a digit.
Y2000data <- dplyr::filter(FinalData, Year == 2000)

#Y2000data contains 187 observations of 6 variables
#FinalData contains 40,437 observations of 6 variables


Each of the four CSV data sets (countries_total, income_per_person, life_expectancy_years, and population_total) was imported into R for data preparation. The exact steps of the data preparation are documented in the comments in the code above; press the “show” button to view them. income_per_person and life_expectancy_years were reshaped from wide to long (longitudinal) format so that each contains only three columns (“country”, “year”, and the measured value, either “income” or “life expectancy”) and were then merged into one data set. This data set was then merged with countries_total and population_total to form the “FinalData” data set, which was trimmed to include only information about income, life expectancy, population size, and country region.

# Interactive bubble chart of life expectancy against income for 2015.
# Marker size follows Population and marker color follows Country.
Y2015data <- dplyr::filter(FinalData, Year == 2015)

plot2015 <- plot_ly(
  data = Y2015data,
  x = ~Income,
  y = ~LifeExpectancy,
  alpha = 0.8,
  size = ~Population,
  color = ~Country,
  # The original `width = 2` was removed: plot_ly()'s `width` is the figure
  # width in pixels, so sizing is now left automatic.
  # NOTE(review): Y2015data holds a single year, so `frame` yields a single
  # animation frame - confirm whether the slider is wanted.
  frame = ~Year,
  # Population is formatted with thousands separators for the hover text only;
  # the underlying column stays numeric so it can still drive marker size.
  text = ~paste(
    "Country:", Country,
    "<br>Life Expectancy:", LifeExpectancy,
    "<br>Population:", formatC(Population, format = "d", big.mark = ",")
  ),
  hoverinfo = "text",
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = list(
      text = "Interactive Plot of Life Expectancy by Income in 2015",
      font = list(
        family = "Times New Roman",
        size = 18,
        color = "black"
      )
    )
  )
plot2015


Figure 1: Interactive Plot of Life Expectancy by Income in 2015

This interactive plot shows the relationship between life expectancy and income in the year 2015 in the merged data set. The population of each country is represented by the diameter of its plotted point, and the color of the point represents the country. As shown in this figure, countries in the Americas and Europe regions generally have a higher life expectancy than most countries in the Africa region, with a few outlying data points. When the cursor hovers over a point, the country, life expectancy, and population are displayed.

### Full Data Set
# Animated bubble chart of life expectancy against income, one state per
# value of Year. Point size follows Population; point color follows region.

# Manual color palette supplied to the region color scale.
cols1 <- c("#332288", "#117733", "#44AA99", "#88CCEE", "#DDCC77", "#CC6677")

AllDataPlot <- ggplot(
  FinalData,
  aes(
    x = Income,
    y = LifeExpectancy,
    size = Population,
    color = region
  )
) +
  # group = Country lets gganimate follow each country from one state to the
  # next. The original `ids = Country` is not a ggplot2 aesthetic and was
  # ignored with a warning.
  geom_point(
    aes(group = Country),
    show.legend = TRUE,
    alpha = 0.8
  ) +
  scale_color_manual(values = cols1) +
  # The legend title must be set on `color`, the aesthetic that is actually
  # mapped; `fill` is not mapped in this plot, so labelling it had no effect.
  labs(
    title = "Plot of Life Expectancy by Income at Year: {closest_state}",
    x = "Income",
    y = "Life Expectancy",
    color = "Region"
  ) +
  transition_states(Year) +
  ease_aes("linear")

# Render the animation as a GIF: 300 frames at 10 frames per second.
animate(
  AllDataPlot,
  renderer = gifski_renderer(),
  rewind = FALSE,
  nframes = 300,
  fps = 10
)


Figure 2: Plot of Life Expectancy by Income for Years 1800-2018

This plot shows the relationship between life expectancy and income for all countries in the merged data set. The population of each country is represented by the diameter of its plotted point, and the color of the point represents the region of the country. As shown in the plot, all represented countries increase their life expectancy over the given years, but not all countries show an increase in income.

