SQL in Data Science
- SQL in Data Science - The Basics using Python
- SQL in Data Science - Slightly more Advanced Queries
- SQL in Data Science - Machine Learning
SQL and Python
Subqueries
Use Subqueries to trim down the dataset before running your search by a filter condition from a different SQL table. For this I split up the Penguins dataset used in part 1 into two tables:
Create Table from CSV
Table A
uid | species | island | sex | |
---|---|---|---|---|
0 | 649bc5d39b21a1b93e0985a3 | Adelie | Torgersen | MALE |
1 | 649bc5d39b21a1b93e0985a4 | Adelie | Torgersen | FEMALE |
2 | 649bc5d39b21a1b93e0985a5 | Adelie | Torgersen | FEMALE |
3 | 649bc5d39b21a1b93e0985a7 | Adelie | Torgersen | FEMALE |
4 | 649bc5d39b21a1b93e0985a8 | Adelie | Torgersen | MALE |
... |
Table B
uid | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|
0 | 649bc5d39b21a1b93e0985a3 | 39.1 | 18.7 | 181 | 3750 |
1 | 649bc5d39b21a1b93e0985a4 | 39.5 | 17.4 | 186 | 3800 |
2 | 649bc5d39b21a1b93e0985a5 | 40.3 | 18.0 | 195 | 3250 |
3 | 649bc5d39b21a1b93e0985a7 | 36.7 | 19.3 | 193 | 3450 |
4 | 649bc5d39b21a1b93e0985a8 | 39.3 | 20.6 | 190 | 3650 |
... |
Run the following functions for both CSV files, respectively:
# Load table A from CSV; pandas stages the rows before they are
# inserted into PostgreSQL (repeat for 'dataset_penguins_b.csv' -> df_b).
data_a = pd.read_csv ('dataset_penguins_a.csv')
df_a = pd.DataFrame(data_a)
def create_table():
    """Create the penguins_a table in the PostgreSQL database.

    Errors are printed rather than raised (best-effort, tutorial style).
    """
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # create the table
        cursor.execute('''
        CREATE TABLE penguins_a (
            uid CHAR(25) primary key,
            species VARCHAR(10),
            island VARCHAR(10),
            sex VARCHAR(10)
        )
        ''')
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
def add_data():
    """Insert every row of df_a into the penguins_a table."""
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # BUG FIX: the original INSERT listed measurement columns
        # (culmen_length_mm, culmen_depth_mm, flipper_length_mm,
        # body_mass_g) that belong to penguins_b, not penguins_a, so
        # every insert failed. penguins_a only has uid/species/island/sex.
        for row in df_a.itertuples():
            cursor.execute(
                'INSERT INTO penguins_a (uid, species, island, sex) VALUES (%s, %s, %s, %s)',
                (row.uid, row.species, row.island, row.sex))
        # Commit the changes to the database
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
SELECT WHERE Subquery
We can now query against table A after fulfilling a filter condition from table B:
def get_penguins_subquery(unit, condition):
    """Query uid, island and species from penguins_a for every uid that
    satisfies a filter condition on penguins_b.

    NOTE(review): unit/condition are interpolated into the SQL string
    (identifiers cannot be bound as parameters) - pass trusted,
    hard-coded values only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT uid, island, species FROM penguins_a WHERE uid IN (SELECT uid FROM penguins_b WHERE {0} {1})".format(unit, condition)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original did `return` inside finally BEFORE
        # conn.close(), so the connection was never closed.
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return specimens that weigh more than 6 kg
    results = get_penguins_subquery(unit='body_mass_g', condition='>6000')
    print(results)
UID | Island | Species | |
---|---|---|---|
0 | 649bc5d39b21a1b93e098690 | Biscoe | Gentoo |
1 | 649bc5d39b21a1b93e0986a0 | Biscoe | Gentoo |
Joins
INNER JOIN
def get_penguins_inner_join(column1, column2, column3):
    """Return three columns from the INNER JOIN of penguins_a and
    penguins_b on their shared uid key.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # BUG FIX: removed a stray trailing ')' that made the SQL invalid.
        query = "SELECT {0}, {1}, {2} FROM penguins_a INNER JOIN penguins_b ON penguins_a.uid = penguins_b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_inner_join(column1='species', column2='island', column3='body_mass_g')
    print(results)
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
JOIN Aliases
def get_penguins_alias_join(column1, column2, column3):
    """Join penguins_a and penguins_b using table aliases (implicit
    join in the WHERE clause) and return the three requested columns.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM penguins_a AS a, penguins_b AS b WHERE a.uid = b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_alias_join(column1='species', column2='island', column3='body_mass_g')
    print(results)
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
Multi INNER JOIN
def get_penguins_multi_inner_join(column1, column2, column3):
    """Chain two INNER JOINs: penguins_a (pa) to penguins_b (pb) and to
    the original penguins table (p), all on the shared uid key.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM ((penguins_a pa INNER JOIN penguins_b pb ON pa.uid = pb.uid) INNER JOIN penguins p ON pa.uid = p.uid)".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species from table A, island from the original table and body mass from table B
    results = get_penguins_multi_inner_join(column1='pa.species', column2='p.island', column3='pb.body_mass_g')
    print(results)
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
Self JOIN
A self join normally adds an AND condition on a second shared column to join on, but my two tables only have one index (uid) in common, so the extra condition would just repeat the join key; results are ordered by body mass:
def get_penguins_self_join(column1, column2, column3):
    """Join penguins_a and penguins_b on uid and return the requested
    columns aliased as Species/Island/Mass, ordered by body mass.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # FIX: dropped the duplicated 'AND a.uid = b.uid' predicate -
        # it repeated the join condition verbatim and had no effect.
        query = "SELECT {0} AS Species, {1} AS Island, {2} AS Mass FROM penguins_a a, penguins_b b WHERE a.uid = b.uid ORDER BY b.body_mass_g".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_self_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
    df.head()
Species | Island | Body Mass | |
---|---|---|---|
0 | Chinstrap | Dream | 2700 |
1 | Adelie | Biscoe | 2850 |
2 | Adelie | Biscoe | 2850 |
3 | Adelie | Torgersen | 2900 |
4 | Adelie | Biscoe | 2900 |
... |
LEFT JOIN
def get_penguins_left_join(column1, column2, column3):
    """LEFT JOIN penguins_b onto penguins_a on uid: rows from table A
    are always kept; missing matches in table B come back as NULL.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM penguins_a a LEFT JOIN penguins_b b ON a.uid = b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_left_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
    df.head()
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
There is no difference between RIGHT and LEFT joins here - both return the same result as a FULL OUTER JOIN, because both tables contain the same set of uid
rows. A difference only appears once a row is missing from one of the tables: that row is then either dropped, or kept with all values from the other table listed as NULL, depending on whether you approach the join from the left or the right.
FULL OUTER JOIN
def get_penguins_full_join(column1, column2, column3):
    """FULL OUTER JOIN penguins_a and penguins_b on uid: rows from
    either table are kept, with NULLs where the other side is missing.

    NOTE(review): column names are interpolated into the SQL string -
    pass trusted identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, {2} FROM penguins_a a FULL OUTER JOIN penguins_b b ON a.uid = b.uid".format(column1, column2, column3)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    # return species and island from table A and body mass from table B
    results = get_penguins_full_join(column1='a.species', column2='a.island', column3='b.body_mass_g')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Body Mass'])
    df.head()
Species | Island | Body Mass | |
---|---|---|---|
0 | Adelie | Torgersen | 3750 |
1 | Adelie | Torgersen | 3800 |
2 | Adelie | Torgersen | 3250 |
3 | Adelie | Torgersen | 3450 |
4 | Adelie | Torgersen | 3650 |
... |
UNION
UNION does not really work here. It could be used if another survey comes in and you want all the ID that are associated to specimens of a group - species, island, sex... Select uid
in both tables and filter for the group you are looking for - this returns all the IDs from both tables:
def get_penguins_union():
    """Return the deduplicated UNION of uids from two survey tables
    (penguins_a1, penguins_a2) for specimens on the island 'Dream'."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT uid FROM penguins_a1 WHERE island='Dream' UNION SELECT uid FROM penguins_a2 WHERE island='Dream'"
        # (removed a leftover debug print of the query string)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results
Formatting
Text Strings - Concatenations
def get_penguins_concat_strings(column1, column2, filter):
    """Return two columns plus their '||'-concatenation ("col1 (col2)")
    for rows whose sex equals `filter`.

    NOTE(review): `filter` shadows the builtin, but the name is part of
    the public interface (callers pass it by keyword), so it is kept.
    Column names are interpolated into the SQL - pass trusted
    identifiers only.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # SECURITY FIX: the filter VALUE is now bound as a query
        # parameter instead of being interpolated into the SQL string.
        query = "SELECT {0}, {1}, {0} || ' (' || {1} || ')' FROM penguins_a WHERE sex = %s".format(column1, column2)
        cursor.execute(query, (filter,))
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_concat_strings(column1='species', column2='island', filter='MALE')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Concat'])
    df.head()
Species | Island | Concat | |
---|---|---|---|
0 | Adelie | Torgersen | Adelie (Torgersen) |
1 | Adelie | Torgersen | Adelie (Torgersen) |
2 | Adelie | Torgersen | Adelie (Torgersen) |
3 | Adelie | Torgersen | Adelie (Torgersen) |
4 | Adelie | Torgersen | Adelie (Torgersen) |
Text Strings - Trim
TRIM
,RTRIM
,LTRIM
to remove leading or trailing spaces from a string.
SELECT TRIM('#! ' FROM ' #SQL Tutorial! ') AS TrimmedString;
TrimmedString |
---|
SQL Tutorial |
Text Strings - SUBSTR, LOWER, UPPER
Generate IDs from Substrings - e.g. Island Codes:
def get_penguins_sub_strings(column1, column2, filter=None):
    """Return two columns plus an upper-cased 3-letter code derived
    from the second column via UPPER(SUBSTR(..., 1, 3)).

    BUG FIX: `filter` was a required third parameter but is never used
    in the query, and the call site only passes two arguments - calling
    the function raised a TypeError. It now defaults to None and is
    kept only for backward compatibility.
    """
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT {0}, {1}, UPPER(SUBSTR({1},1,3)) FROM penguins_a".format(column1, column2)
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_sub_strings(column1='species', column2='island')
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Location Code'])
    df.head()
Species | Island | Location Code | |
---|---|---|---|
0 | Adelie | Torgersen | TOR |
1 | Adelie | Torgersen | TOR |
2 | Adelie | Torgersen | TOR |
3 | Adelie | Torgersen | TOR |
4 | Adelie | Torgersen | TOR |
Text Strings - STRFTIME
The penguins dataset does not contain timestamps ~ let's import another one that does:
tweet_data = pd.read_csv ('datasets_tweets.csv')
df = pd.DataFrame(tweet_data)
df = df.drop(['negativereason', 'negativereason_confidence', 'text', 'tweet_location', 'tweet_coord', 'negativereason_gold', 'airline_sentiment_gold'], axis=1)
df = df.drop_duplicates(subset=['tweet_id'], keep='first')
def create_table():
    """Create the tweets table in the PostgreSQL database."""
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # create the table
        # NOTE(review): tweet ids like 570306133677760513 overflow INT;
        # BIGINT would be safer - confirm against the dataset.
        cursor.execute('''
        CREATE TABLE tweets (
            tweet_id INT primary key,
            airline_sentiment VARCHAR,
            airline_sentiment_confidence FLOAT,
            airline VARCHAR,
            name VARCHAR,
            retweet_count INT,
            tweet_created TIMESTAMP,
            user_timezone VARCHAR
        )
        ''')
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
def add_data():
    """Insert every row of the deduplicated tweets DataFrame into the
    tweets table."""
    conn = None
    try:
        # read database configuration
        params = config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # insert data row by row, binding every value as a parameter
        for row in df.itertuples():
            cursor.execute(
                'INSERT INTO tweets (tweet_id, airline_sentiment, airline_sentiment_confidence, airline, name, retweet_count, tweet_created, user_timezone) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                (row.tweet_id, row.airline_sentiment, row.airline_sentiment_confidence, row.airline, row.name, row.retweet_count, row.tweet_created, row.user_timezone))
        # Commit the changes to the database
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: the original never closed the connection.
        if conn is not None:
            conn.close()
tweet_id | airline_sentiment | airline_sentiment_confidence | airline | name | retweet_count | tweet_created | user_timezone | |
---|---|---|---|---|---|---|---|---|
0 | 570306133677760513 | neutral | 1.0000 | Virgin America | cairdin | 0 | 2015-02-24 11:35:52 -0800 | Eastern Time (US & Canada) |
1 | 570301130888122368 | positive | 0.3486 | Virgin America | jnardino | 0 | 2015-02-24 11:15:59 -0800 | Pacific Time (US & Canada) |
2 | 570301083672813571 | neutral | 0.6837 | Virgin America | yvonnalynn | 0 | 2015-02-24 11:15:48 -0800 | Central Time (US & Canada) |
3 | 570301031407624196 | negative | 1.0000 | Virgin America | jnardino | 0 | 2015-02-24 11:15:36 -0800 | Pacific Time (US & Canada) |
4 | 570300817074462722 | negative | 1.0000 | Virgin America | jnardino | 0 | 2015-02-24 11:14:45 -0800 | Pacific Time (US & Canada) |
def get_tweets_datetime_strings():
    """Query the tweets table and extract year, month, day, day-of-week
    and the elapsed interval since each tweet's timestamp."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT tweet_created, EXTRACT(ISOYEAR FROM tweet_created), EXTRACT(MONTH FROM tweet_created), EXTRACT(Day FROM tweet_created), EXTRACT(DOW FROM tweet_created), CURRENT_DATE - tweet_created FROM tweets;"
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_tweets_datetime_strings()
    df = pd.DataFrame(results, columns=['Timestamp', 'Year', 'Month', 'Day', 'Day of the Week', 'Days Since'])
    df.head()
Timestamp | Year | Month | Day | Day of the Week | Days Since | |
---|---|---|---|---|---|---|
0 | 2015-02-24 11:35:52 | 2015 | 2 | 24 | 2 | 3048 days 12:24:08 |
1 | 2015-02-24 11:15:59 | 2015 | 2 | 24 | 2 | 3048 days 12:44:01 |
2 | 2015-02-24 11:15:48 | 2015 | 2 | 24 | 2 | 3048 days 12:44:12 |
3 | 2015-02-24 11:15:36 | 2015 | 2 | 24 | 2 | 3048 days 12:44:24 |
4 | 2015-02-24 11:14:45 | 2015 | 2 | 24 | 2 | 3048 days 12:45:15 |
Case Statements
CASE WHEN THEN ELSE - One-Hot Encoding
def get_penguins_binary_classification():
    """Query the tweets table and one-hot-encode the sentiment:
    'negative' -> 0, everything else -> 1."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        query = "SELECT airline, airline_sentiment, CASE airline_sentiment WHEN 'negative' THEN 0 ELSE 1 END negative FROM tweets"
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_binary_classification()
    df = pd.DataFrame(results, columns=['Airline', 'Sentiment', 'Binary Sentiment'])
    df.head()
Airline | Sentiment | Binary Sentiment | |
---|---|---|---|
0 | Virgin America | neutral | 1 |
1 | Virgin America | positive | 1 |
2 | Virgin America | neutral | 1 |
3 | Virgin America | negative | 0 |
4 | Virgin America | negative | 0 |
CASE WHEN AND THEN ELSE - Multi-Classification
def get_penguins_multi_classification():
    """Classify penguins into mass classes via a CASE expression:
    'A' below 3 kg, 'B' from 3 kg to 6 kg, 'C' above 6 kg."""
    results = []
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        cursor = conn.cursor()
        # BUG FIX: the original CASE had a gap (body_mass_g = 3000 hit
        # the ELSE branch) and an overlap (6000 matched both 'B' and
        # 'C', with 'B' winning). The branches are now contiguous and
        # exclusive; ELSE only catches NULL / unexpected values.
        query = "SELECT species, island, sex, CASE WHEN body_mass_g < 3000 THEN 'A' WHEN body_mass_g <= 6000 THEN 'B' WHEN body_mass_g > 6000 THEN 'C' ELSE 'something went wrong' END mass_class FROM penguins"
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # BUG FIX: close the connection (the original returned before
        # conn.close() could run, leaking the connection).
        if conn is not None:
            conn.close()
    return results


if __name__ == '__main__':
    results = get_penguins_multi_classification()
    df = pd.DataFrame(results, columns=['Species', 'Island', 'Sex', 'Mass Class'])
    df.head()
Species | Island | Sex | Mass Class | |
---|---|---|---|---|
0 | Adelie | Torgersen | MALE | B |
1 | Adelie | Torgersen | FEMALE | B |
2 | Adelie | Torgersen | FEMALE | B |
3 | Adelie | Torgersen | FEMALE | B |
4 | Adelie | Torgersen | MALE | B |