SQL Data Analysis - Covid-19

By Elad Oz Cohen in SQL


Data used: https://ourworldindata.org/

Q1: Exploring infections death rates both at a global scale and at a continent scale.

/* Using CTE to create a derived table with the relevant data at the continent scale */
WITH continents_average AS (
    SELECT 'Continent Average' AS 'Location', 
           CONVERT(DATE,v.date,103) AS 'Date',
           ROUND(SUM(cd.new_cases)/COUNT(DISTINCT cd.continent),0) AS 'New Infections',
           ROUND(SUM(v.daily_vaccinations)/COUNT(DISTINCT cd.continent),0) AS 'Vaccinations',
           SUM(CAST(cd.new_deaths AS INT))/COUNT(DISTINCT cd.continent) AS 'New Deaths'
    FROM vaccinations v 
    INNER JOIN coviddeaths cd ON cd.date = v.date AND cd.iso_code = v.iso_code
    WHERE cd.Location NOT LIKE '%income%' AND cd.Location NOT LIKE '%world%' AND cd.Location != cd.continent
    GROUP BY CONVERT(DATE,v.date,103)
),

/* Creating another CTE to create a derived table with the relevant data for the global scale */
continents_cases AS (
    SELECT cd.continent AS 'Location',
           CONVERT(DATE,v.date,103) AS 'Date',
           SUM(cd.new_cases) AS 'New Infections',
           SUM(v.daily_vaccinations) AS 'Vaccinations',
           SUM(CAST (cd.new_deaths AS INT)) AS 'New Deaths'
    FROM vaccinations v 
    INNER JOIN coviddeaths cd ON cd.date = v.date AND cd.location = v.location
    WHERE cd.Location NOT LIKE '%income%' AND cd.Location NOT LIKE '%world%' AND cd.Location != cd.continent
    GROUP BY cd.continent, CONVERT(DATE,v.date,103)
)

/* Combining the two tables using UNION ALL */
SELECT 'Continent Average' AS 'Location', 
       [Date], 
       [New Infections], 
       [Vaccinations], 
       [New Deaths]
FROM continents_average
UNION ALL
SELECT [Location], 
       [Date], 
       [New Infections], 
       [Vaccinations], 
       [New Deaths]
FROM continents_cases;

Visual Results




Q2: Investigating what are the top 10 countries with the highest percentage of their population fully vaccinated.

Note: The calculation was executed for large countries defined as countries with population over 3 million citizens

/* Creating a temporary table having the calculated column percentage of population fully vaccinated, along with some descriptive information */

DROP TABLE IF EXISTS my_fully_vac_table;

SELECT v.location AS 'Country/Territory',
       v.people_fully_vaccinated,
       v.date,
       cd.population,
       (v.people_fully_vaccinated/cd.population)* 100 AS 'Percent Population Fully Vaccinated'
INTO my_fully_vac_table
FROM  vaccinations v 
INNER JOIN coviddeaths cd ON cd.date = v.date AND cd.location = v.location
WHERE cd.location != cd.continent /* Removing duplicated rows from the dataset */
GROUP BY v.location, v.people_fully_vaccinated ,cd.population, v.date
HAVING cd.population > 3000000;


/* Creating another temporary table that contains the relevant information of the top 10 countries that were the first to reach herd immunity (70% of population fully vaccinated) */

DROP TABLE IF EXISTS top_10_vac;

SELECT TOP 10 [Country/Territory], 
             MAX(ROUND([Percent Population Fully Vaccinated],2)) AS 'Percent Vaccinated' 
INTO top_10_vac
FROM my_fully_vac_table
GROUP BY [Country/Territory]
HAVING MAX([Percent Population Fully Vaccinated]) <= 100 /* Note: due to vaccination of non-residence, some countries exceeded vaccinating 100% of their population, thus removed from output*/
ORDER BY 2 DESC;


SELECT * 
FROM top_10_vac;



Here I’m creating (and pre-processing) data i’ll be using in the next questions.

/*Creating a temporary table and inserting the relevent data + */
DROP TABLE IF EXISTS percent_table;

CREATE TABLE  percent_table (Country NVARCHAR(20), 
              fully_vaccinated BIGINT, 
              population_size BIGINT,
              date_time DATE, 
              percent_fully_vaccinated FLOAT);


INSERT INTO percent_table(Country, fully_vaccinated, population_size, date_time, percent_fully_vaccinated)

SELECT  v.location,
		v.people_fully_vaccinated,
		cd.population,
		cd.date,
	    (v.people_fully_vaccinated/cd.population)* 100 
FROM  vaccinations v INNER JOIN coviddeaths  cd ON cd.date = v.date AND cd.location = v.location
WHERE ((v.people_fully_vaccinated/cd.population)* 100) >= 70  /* This condition filters countries who vaccinated less then 70% of their population*/ 

     AND
     
	   cd.location NOT LIKE '%income%' AND cd.Location NOT LIKE '%world%' /* This condition removes irrelevent observations for the current analysis (mainly: global and regional data) */


GROUP BY v.location, cd.date, v.people_fully_vaccinated ,cd.population
HAVING cd.population > 3000000 




/* Finally, deleting countries with over 100% of population vaccintaed.
(note: these countries are small countries that also vaccinated non-residence people in their teritory.)*/

DELETE FROM percent_table
WHERE Country IN (
    SELECT Country
    FROM percent_table
    WHERE percent_fully_vaccinated > 100)

Q3: Listing the first 10 countries that were the first to reach herd immunity.

Note: Although a controversial topic, for the sake of practice I choose to define herd immunity as countries that at least 70% of their population is fully vaccinated.

WITH temp_table
AS(
SELECT *, ROW_NUMBER() OVER(PARTITION BY Country ORDER BY percent_fully_vaccinated) 'ranked'
FROM percent_table)


SELECT  TOP 10 Country,  date_time, percent_fully_vaccinated
FROM temp_table
WHERE ranked = 1
ORDER BY date_time
Country Date
Uruguay 2021-08-12
Singapore 2021-08-14
Portugal 2021-08-27
Belgium 2021-08-30
Denmark 2021-08-31
Spain 2021-08-31
Chile 2021-09-02
Ireland 2021-09-06
China 2021-09-15
Wales 2021-09-24

It appears that not all those who were to first to reach herd immunity also ended being the most (fully) vaccinated countries. Perhaps the rate of vaccinations in former countries reached a plateau upon reaching herd immunity. The next question investigates this hypothesis.


Q4: Investegating how the vacination rates have changed across time for top countries who were among the first to reach herd immunity but weren’t also the top countries who vaccinated most of their population.


/*Using CTE and windows-functions, extracting the relevent data for those countries 
who were the first to reach herd immunity while not beign the top country to fully vaccinate their population. */


WITH ranked_countries
AS(
SELECT *, ROW_NUMBER() OVER(PARTITION BY Country ORDER BY percent_fully_vaccinated) 'ranked'
FROM percent_table),

herd_table
AS (
	SELECT  TOP 10 Country,  date_time, percent_fully_vaccinated
	FROM ranked_countries
	WHERE ranked = 1
	ORDER BY date_time),
	
	
top_vac
AS(
	SELECT v.location 'Country',
	v.people_fully_vaccinated,
	cd.population,
	(v.people_fully_vaccinated/cd.population)* 100 'Percent Population Fully Vacinated'
	FROM  vaccinations v INNER JOIN coviddeaths  cd 
	ON cd.date = v.date AND cd.location = v.location
	WHERE cd.location != cd.continent
	GROUP BY v.location, v.people_fully_vaccinated ,cd.population
	HAVING cd.population > 3000000)





SELECT *
FROM my_fully_vac_table
WHERE Country IN 
				(SELECT Country
				FROM herd_table 
				WHERE Country NOT IN (SELECT Country FROM top_10_vac))

ORDER BY Country, date



It appears my hypothesize is correct as the slope of the vaccination rate is reaching asymptotic around august, which is the month that most countries in Q2 have reached herd immunity.

Posted on:
January 1, 0001
Length:
5 minute read, 900 words
Categories:
SQL
Tags:
hugo-site
See Also:
SQL Marketing Campaign Analysis
Mean and median Simulation using R
Python Web Scraping Project