Blog 1 -- Databases and Geographical Data Visualization
Creating a database
We would like to create a database, temps.db
, containing 3 tables: temperatures
, stations
, and countries
.
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly
from plotly import express as px
import numpy as np
import sqlite3
from sklearn.linear_model import LinearRegression
from calendar import month_name
data_path = "/Users/lzj/work/pic16b/blog1data/"
#open database connection
conn = sqlite3.connect(data_path + "temps.db")
conn.execute("DROP TABLE IF EXISTS temperatures")
#ensure we don't later append to an existing copy
<sqlite3.Cursor at 0x7f7c88fa7030>
After opening the database connection, we read the temperatures data from a .csv
file. As this file is large, we read it into the database in chunks of 100000 rows. We will use the prepare_df
function to clean up the data chunks as we read it in.
df_iter = pd.read_csv(data_path + "temps.csv", chunksize = 100000)
# cleans temps data
def prepare_df(df):
df = df.set_index(keys=["ID", "Year"])
df = df.stack()
df = df.reset_index()
df = df.rename(columns = {"level_2" : "Month" , 0 : "Temp"})
df["Month"] = df["Month"].str[5:].astype(int)
df["Temp"] = df["Temp"] / 100
return(df)
for df in df_iter:
df = prepare_df(df)
df.to_sql("temperatures", conn, if_exists = "append", index = False)
The data on stations and countries are smaller and don’t have to be read in chunks.
stations = pd.read_csv(data_path + "station-metadata.csv")
# create FIPS code column for merging later
stations["FIPS 10-4"] = stations["ID"].str[0:2]
stations.to_sql("stations", conn, if_exists = "replace", index = False)
/Users/lzj/opt/anaconda3/envs/PIC16B/lib/python3.8/site-packages/pandas/core/generic.py:2872: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores.
sql.to_sql(
countries = pd.read_csv(data_path + "countries.csv")
countries.to_sql("countries", conn, if_exists = "replace", index = False)
conn.close()
2. Query the database
We can write a function that uses a SQL command to extract a Pandas dataframe from our database. We use the LEFT JOIN
keyword to merge the three sources of data, and the WHERE
to specify the conditions (here, a country, a range of years, and a specific month).
def query_climate_database(country, year_begin, year_end, month):
conn = sqlite3.connect(data_path +"temps.db")
cursor = conn.cursor()
cmd = \
"""
SELECT S.name, S.latitude, S.longitude, C.name, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON C.'FIPS 10-4' = S.'FIPS 10-4'
WHERE (T.year >= """ + str(year_begin) + """ AND T.year <= """+str(year_end)+""")
AND T.month ="""+ str(month) +""" AND C.Name = '""" +country+"""'"""
df = pd.read_sql_query(cmd, conn)
return df
india_df = query_climate_database(country = "India",
year_begin = 1980,
year_end = 2020,
month = 1)
india_df.head()
NAME | LATITUDE | LONGITUDE | Name | Year | Month | Temp | |
---|---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1984 | 1 | 24.81 |
3. Creating a geographic scatterplot
By extracting relevant subsets of our database, we can see how average yearly changes in temperature vary within a given country.
We first define a function to perform a linear regression of the temperature on the year, and return the coefficients.
def coef(data_group):
'''
perform linear regression on a dataframe with year and temp.
returns the coefficients
'''
x = data_group[["Year"]]
y = data_group["Temp"]
LR = LinearRegression()
LR.fit(x, y)
return LR.coef_[0]
Now, we can write a function to plot the temperature coefficients in a geographic scatterplot using plotly
’s scatter_mapbox
. The coefficients are represented as the color.
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs,
mapbox_style="carto-positron",
zoom = 2,
color_continuous_scale=px.colors.diverging.RdGy_r,
opacity=1,
**kwargs):
'''
plot temperature coefficients in a geographic scatterplot.
'''
df = query_climate_database(country, year_begin, year_end, month)
coefs = df.groupby(["NAME"]).apply(coef)
coefs = coefs.reset_index()
coefs = coefs.rename(columns={0:"Estimated yearly increase (\N{DEGREE SIGN}C)"})
min_filter = df.groupby(["NAME"]).count()["Name"]>=min_obs
coords = df.groupby(["NAME"]).first(min_count=min_obs)[["LATITUDE","LONGITUDE"]][min_filter]
x = pd.merge(coefs,coords,on=["NAME"])
title = "Estimates of yearly increase in temperature in "\
+ month_name[month] + " for stations in " + country + ", years " + str(year_begin) + "-" + str(year_end)
fig = px.scatter_mapbox(x,
lat = "LATITUDE",
lon = "LONGITUDE",
hover_name = "NAME",
color = "Estimated yearly increase (\N{DEGREE SIGN}C)",
hover_data={
"LATITUDE":':.3f',
"LONGITUDE":':.3f',
"Estimated yearly increase (\N{DEGREE SIGN}C)":':.3f'
},
title = title,
zoom = zoom,
color_continuous_scale=color_continuous_scale,
mapbox_style = mapbox_style,
color_continuous_midpoint=0,
opacity=opacity
)
return fig
fig=temperature_coefficient_plot("India", 1980, 2020, 1, 10)
fig.show()
fig = temperature_coefficient_plot("United Kingdom", 1980, 2020, 1, 10, zoom=3)
fig.show()
4a
Plotting within-year temperature differences
Different locations exhibit different seasonal differences in temperatures. Generally, coastal places have milder winters and summers, so they tend to have a narrower within-year temperature range. We can use another geographical scatterplot to better understand how the yearly temperature range vary across a country’s geography.
def query_country_year(country, year):
'''
returns dataframe of all temperatures in a country in a single year.
'''
conn = sqlite3.connect(data_path +"temps.db")
cursor = conn.cursor()
cmd = \
"""
SELECT S.name, S.latitude, S.longitude, C.name, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON C.'FIPS 10-4' = S.'FIPS 10-4'
WHERE (T.year == """ + str(year)+""" AND C.Name = '""" +country+"""')"""
df = pd.read_sql_query(cmd, conn)
return df
def year_diff(df):
'''
computes temperature difference between hottest and coldest month
'''
df_group = df.groupby(["NAME"])
obs_filter = df_group.count()["Name"]>=12
temp_diff = (df_group.max()[['Temp']] - df_group.min()[['Temp']])[obs_filter]
coords = df.groupby(["NAME"]).first()[["LATITUDE","LONGITUDE"]][obs_filter]
x = pd.merge(temp_diff,coords,on='NAME')
x = x.rename(columns={'Temp':"Temperature difference"})
x = x.reset_index()
return x
We’ve defined query_country_year
to extract the relevant data, and pass the output to year_diff
to calculate the difference between the average temperature in the hottest month and the coldest month, at the station level. The output can be used in seasonal_difference_plot
below.
def seasonal_difference_plot(country, year,
mapbox_style="carto-positron",
color_continuous_scale=px.colors.sequential.dense,
zoom=2,
**kwargs):
x = year_diff(query_country_year(country, year))
title = "Difference in temperature between hottest and coolest month in "\
+ str(year) + " for stations in " + country
fig = px.scatter_mapbox(x,
lat = "LATITUDE",
lon = "LONGITUDE",
hover_name = "NAME",
color = "Temperature difference",
hover_data={
"LATITUDE":':.3f',
"LONGITUDE":':.3f',
"Temperature difference":':.3f'
},
title = title,
zoom=zoom,
color_continuous_scale=color_continuous_scale,
mapbox_style = mapbox_style,
**kwargs
)
return fig
fig = seasonal_difference_plot('Russia', 2020,
mapbox_style="carto-positron",
zoom=1)
fig.show()
fig = seasonal_difference_plot('Australia', 1970)
fig.show()
In the first plot, we can see how European Russia is relatively mild compared to the extreme temperature ranges experienced in Siberia. The second plot shows the contrast between coastal and inland Australia as ocean’s heat capacity buffers the seasonal variations in temperature.
4b
Seasonal line graphs by decade
Having explored seasonal temperature ranges, perhaps we would like to visualize how it evolves across time. We will attempt to produce a plot that visualizes the trend across two time scales - (i) seasonality within a year, and (ii) long-term trend across decades.
def query_station(station):
'''
extract all data for a given station
'''
conn = sqlite3.connect(data_path +"temps.db")
cursor = conn.cursor()
cmd = \
"""
SELECT S.name, S.latitude, S.longitude, C.name, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON C.'FIPS 10-4' = S.'FIPS 10-4'
WHERE S.name = '""" +station+"""'"""
df = pd.read_sql_query(cmd, conn)
return df
def time_trends(df):
'''
identify decades and compute mean temperatures by decade and month
'''
df['decade'] = df[['Year']]//10 * 10
#only keep decades with at least 80 observations
obs = df.groupby(['decade']).transform(np.size)['Year'] >= 80
df = df.loc[obs,]
g = df.groupby(['Month','decade']).mean()
g = g.reset_index()
return g
Having created a column for the decade of each observation, we can now plot the mean seasonal trend for each decade in facets. Facets are subplots for different subsets of the data. Setting the option facet_col='decade'
lets us plot each decade in separate columns, while the shared y-axis allows us to compare the trend across decades.
def plot_time_trends(station, **kwargs):
'''plot time trends in decade facets'''
z = time_trends(query_station(station))
fig = px.line(z, x='Month', y='Temp', facet_col='decade',
title = "Seasonal temperature trend by decade in station: "+station,
**kwargs)
return fig
fig = plot_time_trends('HEATHROW')
fig.show()
We can see that the peak monthly mean temperatures have gone up in London Heathrow in recent decades. With a slight modification, we can also combine the decades into the same graph, identifying each decade by its color.
def plot_combined_decades(station, color_discrete_sequence= px.colors.sequential.Oranges, **kwargs):
'''plot time trends, labelling decades with colors'''
z = time_trends(query_station(station))
fig = px.line(z, x="Month", y="Temp", color='decade',
title = "Seasonal temperature trend by decade in station: "+station,
color_discrete_sequence=color_discrete_sequence,
**kwargs)
return fig
fig = plot_combined_decades('ST_PETERSBURG',
color_discrete_sequence= px.colors.sequential.Plotly3_r,
template='plotly_white')
fig.show()