SQL in Data Science
- SQL in Data Science - The Basics using Python
- SQL in Data Science - Slightly more Advanced Queries
- SQL in Data Science - Machine Learning
SQL and Python
Subqueries
Use Subqueries to trim down the dataset before running your search by a filter condition from a different SQL table. For this I split up the Penguins dataset used in part 1 into two tables:
Create Table from CSV
Table A
uid | species | island | sex | |
---|---|---|---|---|
0 | 649bc5d39b21a1b93e0985a3 | Adelie | Torgersen | MALE |
1 | 649bc5d39b21a1b93e0985a4 | Adelie | Torgersen | FEMALE |
2 | 649bc5d39b21a1b93e0985a5 | Adelie | Torgersen | FEMALE |
3 | 649bc5d39b21a1b93e0985a7 | Adelie | Torgersen | FEMALE |
4 | 649bc5d39b21a1b93e0985a8 | Adelie | Torgersen | MALE |
... |
Table B
uid | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|
0 | 649bc5d39b21a1b93e0985a3 | 39.1 | 18.7 | 181 | 3750 |
1 | 649bc5d39b21a1b93e0985a4 | 39.5 | 17.4 | 186 | 3800 |
2 | 649bc5d39b21a1b93e0985a5 | 40.3 | 18.0 | 195 | 3250 |
3 | 649bc5d39b21a1b93e0985a7 | 36.7 | 19.3 | 193 | 3450 |
4 | 649bc5d39b21a1b93e0985a8 | 39.3 | 20.6 | 190 | 3650 |
... |
Run the following functions for both CSV files, respectively:
# Load table A from CSV; pandas stages the rows before they are
# inserted into PostgreSQL (repeat for 'dataset_penguins_b.csv' -> df_b).
data_a = pd.read_csv ('dataset_penguins_a.csv')
df_a = pd.DataFrame(data_a)
def create_table():
    """Create the penguins_a table in the PostgreSQL database.

    Errors are printed rather than raised (best-effort, tutorial style).
    """
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # create the table
        cursor.execute('''
        CREATE TABLE penguins_a (
            uid CHAR(25) primary key,
            species VARCHAR(10),
            island VARCHAR(10),
            sex VARCHAR(10)
        )
        ''')
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
def add_data():
    """Insert every row of df_a into the penguins_a table."""
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # BUG FIX: the original INSERT listed measurement columns
        # (culmen_length_mm, culmen_depth_mm, flipper_length_mm,
        # body_mass_g) that belong to penguins_b, not penguins_a, so
        # every insert failed. penguins_a only has uid/species/island/sex.
        for row in df_a.itertuples():
            cursor.execute(
                'INSERT INTO penguins_a (uid, species, island, sex) VALUES (%s, %s, %s, %s)',
                (row.uid, row.species, row.island, row.sex))
        # Commit the changes to the database
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
SELECT WHERE Subquery
We can now query against table A after fulfilling a filter condition from table B:
def get_penguins_subquery(unit, condition):
    """Query uid, island and species from penguins_a for every uid that
    satisfies a filter condition on penguins_b.

    NOTE(review): unit/condition are interpolated into the SQL string
    (identifiers cannot be bound as parameters) - pass trusted,
    hard-coded values only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT uid, island, species FROM penguins_a WHERE uid IN (SELECT uid FROM penguins_b WHERE {0} {1})".format(unit, condition)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original did `return` inside finally BEFORE
        # conn.close(), so the connection was never closed.
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return specimens that weigh more than 6 kg
    results = get_penguins_subquery(unit='body_mass_g', condition='>6000')
    print(results)
UID | Island | Species | |
---|---|---|---|
0 | 649bc5d39b21a1b93e098690 | Biscoe | Gentoo |
1 | 649bc5d39b21a1b93e0986a0 | Biscoe | Gentoo |
Joins
INNER JOIN
def get_penguins_inner_join(column1, column2, column3):
    """Return three columns from the INNER JOIN of penguins_a and
    penguins_b on their shared uid key.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # BUG FIX: removed a stray trailing ')' that made the SQL invalid.
        query = "SELECT {0}, {1}, {2} FROM penguins_a INNER JOIN penguins_b ON penguins_a.uid = penguins_b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_inner_join(column1='species', column2='island', column3='body_mass_g')
    print(results)
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
JOIN Aliases
def get_penguins_alias_join(column1, column2, column3):
    """Join penguins_a and penguins_b using table aliases (implicit
    join in the WHERE clause) and return the three requested columns.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM penguins_a AS a, penguins_b AS b WHERE a.uid = b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_alias_join(column1='species', column2='island', column3='body_mass_g')
    print(results)
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
Multi INNER JOIN
def get_penguins_multi_inner_join(column1, column2, column3):
    """Chain two INNER JOINs: penguins_a (pa) to penguins_b (pb) and to
    the original penguins table (p), all on the shared uid key.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM ((penguins_a pa INNER JOIN penguins_b pb ON pa.uid = pb.uid) INNER JOIN penguins p ON pa.uid = p.uid)".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species from table A, island from the original table and body mass from table B
    results = get_penguins_multi_inner_join(column1='pa.species', column2='p.island', column3='pb.body_mass_g')
    print(results)
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
Self JOIN
A self join normally adds an AND condition on a second shared column to join on, but my two tables only have one index (uid) in common, so the extra condition would just repeat the join key; results are ordered by body mass:
def get_penguins_self_join(column1, column2, column3):
    """Join penguins_a and penguins_b on uid and return the requested
    columns aliased as Species/Island/Mass, ordered by body mass.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # FIX: dropped the duplicated 'AND a.uid = b.uid' predicate -
        # it repeated the join condition verbatim and had no effect.
        query = "SELECT {0} AS Species, {1} AS Island, {2} AS Mass FROM penguins_a a, penguins_b b WHERE a.uid = b.uid ORDER BY b.body_mass_g".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_self_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
    df.head()
Species | Island | Body Mass | |
---|---|---|---|
0 | Chinstrap | Dream | 2700 |
1 | Adelie | Biscoe | 2850 |
2 | Adelie | Biscoe | 2850 |
3 | Adelie | Torgersen | 2900 |
4 | Adelie | Biscoe | 2900 |
... |
LEFT JOIN
def get_penguins_left_join(column1, column2, column3):
    """LEFT JOIN penguins_b onto penguins_a on uid: rows from table A
    are always kept; missing matches in table B come back as NULL.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM penguins_a a LEFT JOIN penguins_b b ON a.uid = b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_left_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
    df.head()
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
There is no difference between RIGHT and LEFT joins here - both return the same result as a FULL OUTER JOIN, because both tables contain the same set of uid
rows. A difference only appears once a row is missing from one of the tables: that row is then either dropped, or kept with all values from the other table listed as NULL, depending on whether you approach the join from the left or the right.
FULL OUTER JOIN
def get_penguins_full_join(column1, column2, column3):
    """FULL OUTER JOIN penguins_a and penguins_b on uid: rows from
    either table are kept, with NULLs where the other side is missing.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM penguins_a a FULL OUTER JOIN penguins_b b ON a.uid = b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_full_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
    df.head()
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
UNION
UNION does not really work here. It could be used if another survey comes in and you want all the ID that are associated to specimens of a group - species, island, sex... Select uid
in both tables and filter for the group you are looking for - this returns all the IDs from both tables:
def get_penguins_union():
    """Return the deduplicated UNION of uids from two survey tables
    (penguins_a1, penguins_a2) for specimens on the island 'Dream'."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT uid FROM penguins_a1 WHERE island='Dream' UNION SELECT uid FROM penguins_a2 WHERE island='Dream'"
        # (removed a leftover debug print of the query string)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results
Formatting
Text Strings - Concatenations
def get_penguins_concat_strings(column1, column2, filter):
    """Return two columns plus their '||'-concatenation ("col1 (col2)")
    for rows whose sex equals `filter`.

    NOTE(review): `filter` shadows the builtin, but the name is part of
    the public interface (callers pass it by keyword), so it is kept.
    Column names are interpolated into the SQL - pass trusted
    identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # SECURITY FIX: the filter VALUE is now bound as a query
        # parameter instead of being interpolated into the SQL string.
        query = "SELECT {0}, {1}, {0} || ' (' || {1} || ')' FROM penguins_a WHERE sex = %s".format(column1, column2)
        cursor.execute(query, (filter,))
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_concat_strings(column1='species', column2='island', filter='MALE')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Concat'])
    df.head()
Species | Island | Concat | |
---|---|---|---|
0 | Adelie | Torgersen | Adelie (Torgersen) |
1 | Adelie | Torgersen | Adelie (Torgersen) |
2 | Adelie | Torgersen | Adelie (Torgersen) |
3 | Adelie | Torgersen | Adelie (Torgersen) |
4 | Adelie | Torgersen | Adelie (Torgersen) |
Text Strings - Trim
TRIM
,RTRIM
,LTRIM
to remove leading or trailing spaces from a string.
SELECT TRIM('#! ' FROM ' #SQL Tutorial! ') AS TrimmedString;
TrimmedString |
---|
SQL Tutorial |
Text Strings - SUBSTR, LOWER, UPPER
Generate IDs from Substrings - e.g. Island Codes:
def get_penguins_sub_strings(column1, column2, filter=None):
    """Return two columns plus an upper-cased 3-letter code derived
    from the second column via UPPER(SUBSTR(..., 1, 3)).

    BUG FIX: `filter` was a required third parameter but is never used
    in the query, and the call site only passes two arguments - calling
    the function raised a TypeError. It now defaults to None and is
    kept only for backward compatibility.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, UPPER(SUBSTR({1},1,3)) FROM penguins_a".format(column1, column2)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_sub_strings(column1='species', column2='island')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Location Code'])
    df.head()
Species | Island | Location Code | |
---|---|---|---|
0 | Adelie | Torgersen | TOR |
1 | Adelie | Torgersen | TOR |
2 | Adelie | Torgersen | TOR |
3 | Adelie | Torgersen | TOR |
4 | Adelie | Torgersen | TOR |
Text Strings - STRFTIME
The penguins dataset does not contain timestamps ~ let's import another one that does:
tweet_data = pd.read_csv ('datasets_tweets.csv')
df = pd.DataFrame(tweet_data)
df = df.drop(['negativereason', 'negativereason_confidence', 'text', 'tweet_location', 'tweet_coord', 'negativereason_gold', 'airline_sentiment_gold'], axis=1)
df = df.drop_duplicates(subset=['tweet_id'], keep='first')
def create_table():
    """Create the tweets table in the PostgreSQL database."""
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # create the table
        # NOTE(review): tweet ids like 570306133677760513 overflow INT;
        # BIGINT would be safer - confirm against the dataset.
        cursor.execute('''
        CREATE TABLE tweets (
            tweet_id INT primary key,
            airline_sentiment VARCHAR,
            airline_sentiment_confidence FLOAT,
            airline VARCHAR,
            name VARCHAR,
            retweet_count INT,
            tweet_created TIMESTAMP,
            user_timezone VARCHAR
        )
        ''')
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
def add_data():
    """Insert every row of the deduplicated tweets DataFrame into the
    tweets table."""
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # insert data row by row, binding every value as a parameter
        for row in df.itertuples():
            cursor.execute(
                'INSERT INTO tweets (tweet_id, airline_sentiment, airline_sentiment_confidence, airline, name, retweet_count, tweet_created, user_timezone) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                (row.tweet_id, row.airline_sentiment, row.airline_sentiment_confidence, row.airline, row.name, row.retweet_count, row.tweet_created, row.user_timezone))
        # Commit the changes to the database
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
tweet_id | airline_sentiment | airline_sentiment_confidence | airline | name | retweet_count | tweet_created | user_timezone | |
---|---|---|---|---|---|---|---|---|
0 | 570306133677760513 | neutral | 1.0000 | Virgin America | cairdin | 0 | 2015-02-24 11:35:52 -0800 | Eastern Time (US & Canada) |
1 | 570301130888122368 | positive | 0.3486 | Virgin America | jnardino | 0 | 2015-02-24 11:15:59 -0800 | Pacific Time (US & Canada) |
2 | 570301083672813571 | neutral | 0.6837 | Virgin America | yvonnalynn | 0 | 2015-02-24 11:15:48 -0800 | Central Time (US & Canada) |
3 | 570301031407624196 | negative | 1.0000 | Virgin America | jnardino | 0 | 2015-02-24 11:15:36 -0800 | Pacific Time (US & Canada) |
4 | 570300817074462722 | negative | 1.0000 | Virgin America | jnardino | 0 | 2015-02-24 11:14:45 -0800 | Pacific Time (US & Canada) |
def get_tweets_datetime_strings():
    """Query the tweets table and extract year, month, day, day-of-week
    and the elapsed interval since each tweet's timestamp."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT tweet_created, EXTRACT(ISOYEAR FROM tweet_created), EXTRACT(MONTH FROM tweet_created), EXTRACT(Day FROM tweet_created), EXTRACT(DOW FROM tweet_created), CURRENT_DATE - tweet_created FROM tweets;"
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_tweets_datetime_strings()
    df = pd.DataFrame(results, columns=['Timestamp', 'Year', 'Month', 'Day', 'Day of the Week', 'Days Since'])
    df.head()
Timestamp | Year | Month | Day | Day of the Week | Days Since | |
---|---|---|---|---|---|---|
0 | 2015-02-24 11:35:52 | 2015 | 2 | 24 | 2 | 3048 days 12:24:08 |
1 | 2015-02-24 11:15:59 | 2015 | 2 | 24 | 2 | 3048 days 12:44:01 |
2 | 2015-02-24 11:15:48 | 2015 | 2 | 24 | 2 | 3048 days 12:44:12 |
3 | 2015-02-24 11:15:36 | 2015 | 2 | 24 | 2 | 3048 days 12:44:24 |
4 | 2015-02-24 11:14:45 | 2015 | 2 | 24 | 2 | 3048 days 12:45:15 |
Case Statements
CASE WHEN THEN ELSE - One-Hot Encoding
def get_penguins_binary_classification():
    """Query the tweets table and one-hot-encode the sentiment:
    'negative' -> 0, everything else -> 1."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT airline, airline_sentiment, CASE airline_sentiment WHEN 'negative' THEN 0 ELSE 1 END negative FROM tweets"
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_binary_classification()
    df = pd.DataFrame(results, columns=['Airline', 'Sentiment', 'Binary Sentiment'])
    df.head()
Airline | Sentiment | Binary Sentiment | |
---|---|---|---|
0 | Virgin America | neutral | 1 |
1 | Virgin America | positive | 1 |
2 | Virgin America | neutral | 1 |
3 | Virgin America | negative | 0 |
4 | Virgin America | negative | 0 |
CASE WHEN AND THEN ELSE - Multi-Classification
def get_penguins_multi_classification():
    """Classify penguins into mass classes via a CASE expression:
    'A' below 3 kg, 'B' from 3 kg to 6 kg, 'C' above 6 kg."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # BUG FIX: the original CASE had a gap (body_mass_g = 3000 hit
        # the ELSE branch) and an overlap (6000 matched both 'B' and
        # 'C', with 'B' winning). The branches are now contiguous and
        # exclusive; ELSE only catches NULL / unexpected values.
        query = "SELECT species, island, sex, CASE WHEN body_mass_g < 3000 THEN 'A' WHEN body_mass_g <= 6000 THEN 'B' WHEN body_mass_g > 6000 THEN 'C' ELSE 'something went wrong' END mass_class FROM penguins"
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_multi_classification()
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Sex', 'Mass Class'])
    df.head()
Species | Island | Sex | Mass Class | |
---|---|---|---|---|
0 | Adelie | Torgersen | MALE | B |
1 | Adelie | Torgersen | FEMALE | B |
2 | Adelie | Torgersen | FEMALE | B |
3 | Adelie | Torgersen | FEMALE | B |
4 | Adelie | Torgersen | MALE | B |