Skip to main content

Guangzhou, China

Github Repository

SQL in Data Science

  1. SQL in Data Science - The Basics using Python
  2. SQL in Data Science - Slightly more Advanced Queries
  3. SQL in Data Science - Machine Learning

SQL and Python

Subqueries

Use Subqueries to trim down the dataset before running your search by a filter condition from a different SQL table. For this I split up the Penguins dataset used in part 1 into two tables:

Create Table from CSV

Table A

uidspeciesislandsex
0649bc5d39b21a1b93e0985a3AdelieTorgersenMALE
1649bc5d39b21a1b93e0985a4AdelieTorgersenFEMALE
2649bc5d39b21a1b93e0985a5AdelieTorgersenFEMALE
3649bc5d39b21a1b93e0985a7AdelieTorgersenFEMALE
4649bc5d39b21a1b93e0985a8AdelieTorgersenMALE
...

Table B

uidculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_g
0649bc5d39b21a1b93e0985a339.118.71813750
1649bc5d39b21a1b93e0985a439.517.41863800
2649bc5d39b21a1b93e0985a540.318.01953250
3649bc5d39b21a1b93e0985a736.719.31933450
4649bc5d39b21a1b93e0985a839.320.61903650
...

Run the following functions for both CSV files, respectively:

data_a = pd.read_csv ('dataset_penguins_a.csv')   
df_a = pd.DataFrame(data_a)


def create_table():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# create the table
cursor.execute('''
CREATE TABLE penguins_a (
uid CHAR(25) primary key,
species VARCHAR(10),
island VARCHAR(10),
sex VARCHAR(10)
)
''')
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
def add_data():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# insert data
for row in df_a.itertuples():
cursor.execute('INSERT INTO penguins_a (uid, species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (row.uid, row.species, row.island, row.culmen_length_mm, row.culmen_depth_mm, row.flipper_length_mm, row.body_mass_g, row.sex))
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)

SELECT WHERE Subquery

We can now query against table A after fulfilling a filter condition from table B:

def get_penguins_subquery(unit, condition):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT uid, island, species FROM penguins_a WHERE uid IN (SELECT uid FROM penguins_b WHERE {0} {1})".format(unit, condition)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return specimen that weight more than 6kg
results = get_penguins_subquery(unit='body_mass_g', condition='>6000')
print(results)
UIDIslandSpecies
0649bc5d39b21a1b93e098690BiscoeGentoo
1649bc5d39b21a1b93e0986a0BiscoeGentoo

Joins

INNER JOIN

def get_penguins_inner_join(column1, column2, column3):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, {2} FROM penguins_a INNER JOIN penguins_b ON penguins_a.uid = penguins_b.uid)".format(column1, column2, column3)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return species and island from table A and body mass from table B
results = get_penguins_inner_join(column1='species', column2='island', column3='body_mass_g')
print(results)
SpeciesIslandBody Mass
0AdelieTorgersen3750
1AdelieTorgersen3800
2AdelieTorgersen3250
3AdelieTorgersen3450
4AdelieTorgersen3650
...

JOIN Aliases

def get_penguins_alias_join(column1, column2, column3):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, {2} FROM penguins_a AS a, penguins_b AS b WHERE a.uid = b.uid".format(column1, column2, column3)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return species and island from table A and body mass from table B
results = get_penguins_alias_join(column1='species', column2='island', column3='body_mass_g')
print(results)
SpeciesIslandBody Mass
0AdelieTorgersen3750
1AdelieTorgersen3800
2AdelieTorgersen3250
3AdelieTorgersen3450
4AdelieTorgersen3650
...

Multi INNER JOIN

def get_penguins_multi_inner_join(column1, column2, column3):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, {2} FROM ((penguins_a pa INNER JOIN penguins_b pb ON pa.uid = pb.uid) INNER JOIN penguins p ON pa.uid = p.uid)".format(column1, column2, column3)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return species from table A, island from the original table and body mass from table B
results = get_penguins_multi_inner_join(column1='pa.species', column2='p.island', column3='pb.body_mass_g')
print(results)
SpeciesIslandBody Mass
0AdelieTorgersen3750
1AdelieTorgersen3800
2AdelieTorgersen3250
3AdelieTorgersen3450
4AdelieTorgersen3650
...

Self JOIN

AND statement for a second column to join on... but my tables only have one index in common... ordered by body mass:

def get_penguins_self_join(column1, column2, column3):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0} AS Species, {1} AS Island, {2} AS Mass FROM penguins_a a, penguins_b b WHERE a.uid = b.uid AND a.uid = b.uid ORDER BY b.body_mass_g".format(column1, column2, column3)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return species and island from table A and body mass from table B
results = get_penguins_self_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
df.head()
SpeciesIslandBody Mass
0ChinstrapDream2700
1AdelieBiscoe2850
2AdelieBiscoe2850
3AdelieTorgersen2900
4AdelieBiscoe2900
...

LEFT JOIN

def get_penguins_left_join(column1, column2, column3):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, {2} FROM penguins_a a LEFT JOIN penguins_b b ON a.uid = b.uid".format(column1, column2, column3)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return species and island from table A and body mass from table B
results = get_penguins_left_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
df.head()
SpeciesIslandBody Mass
0AdelieTorgersen3750
1AdelieTorgersen3800
2AdelieTorgersen3250
3AdelieTorgersen3450
4AdelieTorgersen3650
...

There is no difference between right and left joins - they all return the same as a full outer join as both tables contain the same uid rows. There will be a difference once one row is missing in the other table - then it will either be dropped or all values from that table be listed as NULL depending on you approaching it from the left or the right.

SQL Joins

FULL OUTER JOIN

def get_penguins_full_join(column1, column2, column3):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, {2} FROM penguins_a a FULL OUTER JOIN penguins_b b ON a.uid = b.uid".format(column1, column2, column3)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return species and island from table A and body mass from table B
results = get_penguins_full_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
df.head()
SpeciesIslandBody Mass
0AdelieTorgersen3750
1AdelieTorgersen3800
2AdelieTorgersen3250
3AdelieTorgersen3450
4AdelieTorgersen3650
...

UNION

UNION does not really work here. It could be used if another survey comes in and you want all the ID that are associated to specimens of a group - species, island, sex... Select uid in both tables and filter for the group you are looking for - this returns all the IDs from both tables:

def get_penguins_union():
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT uid FROM penguins_a1 WHERE island='Dream' UNION SELECT uid FROM penguins_a2 WHERE island='Dream'"
print(query)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()

Formating

Text Strings - Concatenations

def get_penguins_concat_strings(column1, column2, filter):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, {0} || ' (' || {1} || ')' FROM penguins_a WHERE sex = '{2}'".format(column1, column2, filter)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_concat_strings(column1='species', column2='island', filter='MALE')
df = pd.DataFrame(results, columns=['Species', 'Island', 'Concat'])
df.head()
SpeciesIslandConcat
0AdelieTorgersenAdelie (Torgersen)
1AdelieTorgersenAdelie (Torgersen)
2AdelieTorgersenAdelie (Torgersen)
3AdelieTorgersenAdelie (Torgersen)
4AdelieTorgersenAdelie (Torgersen)

Text Strings - Trim

TRIM, RTRIM, LTRIM to remove leading or trailing spaces from a string.

SELECT TRIM('#! ' FROM '    #SQL Tutorial!    ') AS TrimmedString;
TrimmedString
SQL Tutorial

Text Strings - SUBSTR, LOWER, UPPER

Generate IDs from Substrings - e.g. Island Codes:

def get_penguins_sub_strings(column1, column2, filter):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, {1}, UPPER(SUBSTR({1},1,3)) FROM penguins_a".format(column1, column2, filter)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_sub_strings(column1='species', column2='island')
df = pd.DataFrame(results, columns=['Species', 'Island', 'Location Code'])
df.head()
SpeciesIslandLocation Code
0AdelieTorgersenTOR
1AdelieTorgersenTOR
2AdelieTorgersenTOR
3AdelieTorgersenTOR
4AdelieTorgersenTOR

Text Strings - STRFTIME

The penguins dataset does not contain timestamps ~ let's import another one that does:

tweet_data = pd.read_csv ('datasets_tweets.csv')   
df = pd.DataFrame(tweet_data)
df = df.drop(['negativereason', 'negativereason_confidence', 'text', 'tweet_location', 'tweet_coord', 'negativereason_gold', 'airline_sentiment_gold'], axis=1)
df = df.drop_duplicates(subset=['tweet_id'], keep='first')


def create_table():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# create the table
cursor.execute('''
CREATE TABLE tweets (
tweet_id INT primary key,
airline_sentiment VARCHAR,
airline_sentiment_confidence FLOAT,
airline VARCHAR,
name VARCHAR,
retweet_count INT,
tweet_created TIMESTAMP,
user_timezone VARCHAR
)
''')
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
def add_data():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# insert data
for row in df.itertuples():
cursor.execute('INSERT INTO tweets (tweet_id, airline_sentiment, airline_sentiment_confidence, airline, name, retweet_count, tweet_created, user_timezone) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (row.tweet_id, row.airline_sentiment, row.airline_sentiment_confidence, row.airline, row.name ,row.retweet_count, row.tweet_created, row.user_timezone))
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
tweet_idairline_sentimentairline_sentiment_confidenceairlinenameretweet_counttweet_createduser_timezone
0570306133677760513neutral1.0000Virgin Americacairdin02015-02-24 11:35:52 -0800Eastern Time (US & Canada)
1570301130888122368positive0.3486Virgin Americajnardino02015-02-24 11:15:59 -0800Pacific Time (US & Canada)
2570301083672813571neutral0.6837Virgin Americayvonnalynn02015-02-24 11:15:48 -0800Central Time (US & Canada)
3570301031407624196negative1.0000Virgin Americajnardino02015-02-24 11:15:36 -0800Pacific Time (US & Canada)
4570300817074462722negative1.0000Virgin Americajnardino02015-02-24 11:14:45 -0800Pacific Time (US & Canada)
def get_tweets_datetime_strings():
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT tweet_created, EXTRACT(ISOYEAR FROM tweet_created), EXTRACT(MONTH FROM tweet_created), EXTRACT(Day FROM tweet_created), EXTRACT(DOW FROM tweet_created), CURRENT_DATE - tweet_created FROM tweets;"
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_tweets_datetime_strings()
df = pd.DataFrame(results, columns=['Timestamp', 'Year', 'Month', 'Day', 'Day of the Week', 'Days Since'])
df.head()
TimestampYearMonthDayDay of the WeekDays Since
02015-02-24 11:35:52201522423048 days 12:24:08
12015-02-24 11:15:59201522423048 days 12:44:01
22015-02-24 11:15:48201522423048 days 12:44:12
32015-02-24 11:15:36201522423048 days 12:44:24
42015-02-24 11:14:45201522423048 days 12:45:15

Case Statements

CASE WHEN THEN ELSE - One-Hot Encoding

def get_penguins_binary_classification():
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT airline, airline_sentiment, CASE airline_sentiment WHEN 'negative' THEN 0 ELSE 1 END negative FROM tweets"
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_binary_classification()
df = pd.DataFrame(results, columns=['Airline', 'Sentiment', 'Binary Sentiment'])
df.head()
AirlineSentimentBinary Sentiment
0Virgin Americaneutral1
1Virgin Americapositive1
2Virgin Americaneutral1
3Virgin Americanegative0
4Virgin Americanegative0

CASE WHEN AND THEN ELSE - Multi-Classification

def get_penguins_multi_classification():
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT species, island, sex, CASE WHEN body_mass_g < 3000 THEN 'A' WHEN body_mass_g >= 3001 AND body_mass_g <=6000 THEN 'B' WHEN body_mass_g >= 6000 THEN 'C' ELSE 'something went wrong' END mass_class FROM penguins"
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_multi_classification()
df = pd.DataFrame(results, columns=['Species', 'Island', 'Sex', 'Mass Class'])
df.head()
SpeciesIslandSexMass Class
0AdelieTorgersenMALEB
1AdelieTorgersenFEMALEB
2AdelieTorgersenFEMALEB
3AdelieTorgersenFEMALEB
4AdelieTorgersenMALEB