This notebook will cover the basics of working with databases.
To do so, we will be working with a database called country.sqlite
which you can download here. This database was developed by Dr. Eric Dunford for a previous iteration of this class. It contains a set of country-level information over years, and spread across different tables.
As we saw in the slides, we can use SQL in Python and build a connection with database systems through pandas. Here are te steps:
Establish a connection between your local python environment and the database stored locally
Write a SQL query
Execute the query
Pull the result and work with the result in that language
Let's see this in practice
## import pandas and sqlite3
import pandas as pd
import sqlite3
# Establish a connection with the database
conn = sqlite3.connect("country.sqlite")
# write a query
query = "select * from econ"
# just text, wuery
query
# Use the connection with the read_sql() method
pd.read_sql(query,conn)
As noticed before, the output is always a table. From now on, I will show you some different SQL commands and its applications
pd.read_sql("select name from sqlite_master where type='table'",conn)
# build query
query = """
select *
from life
limit 5
"""
#query database
pd.read_sql(query,conn)
# build query
query = """select country, year, polity
from regime
limit 5"""
#query database
pd.read_sql(query,conn)
# build query
query = """select country, ccode, year, riots
from conflict
order by riots"""
#query database
pd.read_sql(query,conn).head()
# build query
query = """select country, ccode, year, riots
from conflict
order by riots desc"""
#query database
pd.read_sql(query,conn).head()
query = """select country, ccode, year, riots
from conflict
where country = 'Nigeria' """
#query database
pd.read_sql(query,conn).head()
query= """
select country, ccode, year, riots
from conflict
where country = 'Nigeria' and year >= 2000
"""
#query database
pd.read_sql(query,conn).head()
query = "select country, ccode, year, riots, protests \
from conflict \
where riots > 20 or protests > 20 "
#query database
pd.read_sql(query,conn).head()
query = "select \
country as COUNTRY_NAME, \
year as YEAR, \
polity as POLITY \
from regime \
limit 20"
#query database
pd.read_sql(query,conn)
query = "select country, ccode, year, riots, protests, \
riots + protests as displays \
from conflict \
order by displays desc"
#query database
pd.read_sql(query,conn).head()
query = "select country, ccode, year, protests, \
(case when protests > 0 then 'Protest' \
else 'No Protest' end) as protest_onset \
from conflict "
#query database
pd.read_sql(query,conn)
query = "select country, ccode, \
cast(year as integer) as year, \
cast(riots as float) as riots, \
protests \
from conflict \
order by riots desc \
limit 5"
#query database
pd.read_sql(query,conn)
query = "select \
distinct country \
from econ "
#query database
pd.read_sql(query,conn).head()
query = "select \
count(*) as n_obs \
from conflict "
#query database
pd.read_sql(query,conn)
query = "select \
count(distinct country) as n_countries \
from conflict "
#query database
pd.read_sql(query,conn)
query = "select \
round(avg(riots),2) as ave_riots, \
sum(riots) as total_riots, \
min(riots) as min_riots, \
max(riots) as max_riots \
from conflict "
#query database
pd.read_sql(query,conn)
query = "select \
country, \
year, \
sum(riots) as riots, \
sum(protests) as protests \
from conflict \
group by country, year "
#query database
pd.read_sql(query,conn).head()
query = "select country, count(*) as n \
from conflict \
group by country "
#query database
pd.read_sql(query,conn).head()
Subqueries allows us to combine several different operations across multiple tables
query = " WITH algeria_a as ( \
SELECT \
country,\
year,\
'bombings' as type,\
explosions_remote_violence as n_events\
FROM conflict\
WHERE country = 'Algeria'),\
algeria_b as (\
SELECT \
country,\
year,\
'vac' as type,\
violence_against_civilians as n_events\
FROM conflict\
WHERE country = 'Algeria')\
select * from algeria_a\
UNION ALL\
select * from algeria_b\
"
#query database
pd.read_sql(query,conn).head()
query = """select
life.*,
econ.ln_gdppc
from life
inner join econ on (life.ccode = econ.ccode and life.year=econ.year)
"""
#query database
pd.read_sql(query,conn)
query = """select
a.*,
b.ln_gdppc
from life a
inner join econ b on (a.ccode = b.ccode and a.year=b.year)
"""
#query database
pd.read_sql(query,conn)
"""select
a.*,
b.ln_gdppc
from life a
left join econ b on (a.ccode = b.ccode and a.year=b.year)
"""
#query database
pd.read_sql(query,conn)
query = """select
a.*,
b.ln_gdppc,
c.regime_type
from life a
left join econ b on (a.ccode = b.ccode and a.year=b.year)
left join regime c on (a.ccode = c.ccode and a.year=c.year)
"""
#query database
pd.read_sql(query,conn)
!jupyter nbconvert _week_13_sql.ipynb --to html --template classic