Skip to main content

Guangzhou, China

Github Repository

SQL in Data Science

PostgreSQL lets you build queries which run a variety of machine learning algorithms against your data.

  1. SQL in Data Science - The Basics using Python
  2. SQL in Data Science - Slightly more Advanced Queries
  3. SQL in Data Science - Machine Learning

Database Installation with Docker Compose

mkdir /opt/pgdata
sudo chown myuser:myuser /opt/pgdata
sudo chmod 777 /opt/pgdata
docker pull postgres:alpine

docker-compose.yml

# Use postgres/example user/password credentials
version: '3'

services:

db:
image: postgres:alpine
restart: unless-stopped
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: password
POSTGRES_DB: testing
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- /opt/pgdata:/var/lib/postgresql/data
ports:
- 5432:5432

adminer:
image: adminer
restart: unless-stopped
ports:
- 8080:8080
docker-compose up
http://localhost:8080

SQL in Data Science

SQL and Python

pip install psycopg2-binary

Create a connection object:

hello_world.py

import psycopg2

DB = 'testing'
HOST = 'localhost'
USER = 'postgres'
PASS = 'password'
PORT = 5432

conn = psycopg2.connect(
database = DB,
host = HOST,
user = USER,
password = PASS,
port = PORT
)

A cursor object will help you execute any queries on the database and retrieve data. Here's how to create a cursor object:

cursor = conn.cursor()

Creating Tables

def create_tables():
""" create tables in the PostgreSQL database"""
commands = (
"""
CREATE TABLE vendors (
vendor_id SERIAL PRIMARY KEY,
vendor_name VARCHAR(255) NOT NULL
)
""",
""" CREATE TABLE parts (
part_id SERIAL PRIMARY KEY,
part_name VARCHAR(255) NOT NULL
)
""",
"""
CREATE TABLE part_drawings (
part_id INTEGER PRIMARY KEY,
file_extension VARCHAR(5) NOT NULL,
drawing_data BYTEA NOT NULL,
FOREIGN KEY (part_id)
REFERENCES parts (part_id)
ON UPDATE CASCADE ON DELETE CASCADE
)
""",
"""
CREATE TABLE vendor_parts (
vendor_id INTEGER NOT NULL,
part_id INTEGER NOT NULL,
PRIMARY KEY (vendor_id , part_id),
FOREIGN KEY (vendor_id)
REFERENCES vendors (vendor_id)
ON UPDATE CASCADE ON DELETE CASCADE,
FOREIGN KEY (part_id)
REFERENCES parts (part_id)
ON UPDATE CASCADE ON DELETE CASCADE
)
""")
)
try:
for command in commands:
cursor.execute(command)
# close communication with the PostgreSQL database server
cursor.close()
# commit the changes
conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
print('INFO :: Database connection closed.')

We can test the connection by running:

python 01_create_read_table.py

INFO :: Connecting to database...
INFO :: PostgreSQL database version:
('PostgreSQL 15.3 on x86_64-pc-linux-musl, compiled by gcc (Alpine 12.2.1_git20220924-r10) 12.2.1 20220924, 64-bit',)
INFO :: Database connection closed.

SQL in Data Science

Inserting Data

def insert_vendor(vendor_name):
""" insert a new vendor into the vendors table """
sql = """INSERT INTO vendors(vendor_name)
VALUES(%s) RETURNING vendor_id;"""
conn = None
vendor_id = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cur = conn.cursor()
# execute the INSERT statement
cur.execute(sql, (vendor_name,))
# get the generated id back
vendor_id = cur.fetchone()[0]
# commit the changes to the database
conn.commit()
# close communication with the database
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()

return vendor_id

if __name__ == '__main__':
# insert one vendor
insert_vendor("3M Co.")

SQL in Data Science

Updating Data

def update_vendor(vendor_id, vendor_name):
""" update vendor name based on the vendor id """
sql = """ UPDATE vendors
SET vendor_name = %s
WHERE vendor_id = %s"""
conn = None
updated_rows = 0
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cur = conn.cursor()
# execute the UPDATE statement
cur.execute(sql, (vendor_name, vendor_id))
# get the number of updated rows
updated_rows = cur.rowcount
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()

return updated_rows


if __name__ == '__main__':
# Update vendor id 1
update_vendor(1, "3M Corp")

SQL in Data Science

Retrieving Data

We use the execute() function and submit a query string as its argument. This query that we submitted will be run against the database:

cursor.execute("SELECT * FROM my_table")

After the query executes, we need to use one of these functions to retrieve data row(s):

  • fetchone() - print(cursor.fetchone())
  • fetchall() - print(cursor.fetchall())
  • fetchmany() - print(cursor.fetchmany(size=3))

Querying data using fetchone() method

def get_vendor():
""" query data from the vendors table """
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
cursor.execute("SELECT vendor_id, vendor_name FROM vendors ORDER BY vendor_name")
print("The number of vendors: ", cursor.rowcount)

print(cursor.fetchone())

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
python 04_fetch_data.py

The number of vendors: 7
(1, '3M Corp')

Querying data using fetchall() method

def get_all_vendors():
""" query data from the vendors table """
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
cursor.execute("SELECT vendor_id, vendor_name FROM vendors ORDER BY vendor_name")
rows = cursor.fetchall()
print("The number of vendors: ", cursor.rowcount)
for row in rows:
print(row)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
python 04_fetch_data.py

The number of vendors: 7
(1, '3M Corp')
(2, 'AKM Semiconductor Inc.')
(3, 'Asahi Glass Co Ltd.')
(4, 'Daikin Industries Ltd.')
(5, 'Dynacast International Inc.')
(6, 'Foster Electric Co. Ltd.')
(7, 'Murata Manufacturing Co. Ltd.')
The number of vendors: 7

Querying data using fetchmany() method

def get_vendors(size):
""" query data from the vendors table """
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
cursor.execute("SELECT vendor_id, vendor_name FROM vendors ORDER BY vendor_name")
rows = cursor.fetchmany(size=size)
print("The number of vendors: ", cursor.rowcount)
for row in rows:
print(row)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
python 04_fetch_data.py

The number of vendors: 7
(1, '3M Corp')
(2, 'AKM Semiconductor Inc.')
(3, 'Asahi Glass Co Ltd.')

Deleting Data

def delete_part(part_id):
""" delete vendors by id """
conn = None
rows_deleted = 0
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# execute the UPDATE statement
cursor.execute("DELETE FROM vendors WHERE vendor_id = %s", (part_id,))
# get the number of updated rows
rows_deleted = cursor.rowcount
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()

return rows_deleted


if __name__ == '__main__':
deleted_rows = delete_part(2)
print('The number of deleted rows: ', deleted_rows)
python 05_delete_data.py
The number of deleted rows: 1

Create Table from CSV

data = pd.read_csv ('dataset_penguins.csv')   
df = pd.DataFrame(data)


def create_table():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# create the table
cursor.execute('''
CREATE TABLE penguins (
uid CHAR(25) primary key,
species VARCHAR(10),
island VARCHAR(10),
culmen_length_mm FLOAT,
culmen_depth_mm FLOAT,
flipper_length_mm FLOAT,
body_mass_g INT,
sex VARCHAR(10)
)
''')
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
def add_data():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# insert data
for row in df.itertuples():
cursor.execute('INSERT INTO penguins (uid, species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (row.uid, row.species, row.island, row.culmen_length_mm, row.culmen_depth_mm, row.flipper_length_mm, row.body_mass_g, row.sex))
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)

Filter Conditions

WHERE

def get_penguins_by_id(uid):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (species, island, body_mass_g, sex) FROM penguins WHERE uid = '{0}'".format(uid)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_by_id(uid='649bc5d39b21a1b93e0985a5')
print(results)

WHERE by Value

def get_penguins_by_mass_thresh(body_mass_g):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (species, island, sex) FROM penguins WHERE body_mass_g >= '{0}'".format(body_mass_g)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_by_mass_thresh(body_mass_g=6000)
print(results)

WHERE BETWEEN Values

def get_penguins_by_mass_range(lower, upper):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (species, island, sex) FROM penguins WHERE body_mass_g BETWEEN '{0}' AND '{1}'".format(lower, upper)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_by_mass_range(lower=4300, upper=4500)
print(results)

WHERE by No-Match

def get_penguins_by_non_match(species):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (species, island, sex) FROM penguins WHERE species <> '{0}'".format(species)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_by_non_match(species='Adelie')
print(results)

WHERE IS NULL

def get_penguins_where_sex_is_null():
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (uid, species, island) FROM penguins WHERE sex IS NULL"

cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_where_sex_is_null()
print(results)

WHERE IN Group

def get_penguins_in(group):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (species, island, sex) FROM penguins WHERE uid IN {0}".format(group)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_in(
group=('649bc5d39b21a1b93e0985a5', '649bc5d39b21a1b93e0986de', '649bc5d39b21a1b93e098671')
)
print(results)

WHERE Value AND Value

def get_penguins_and(body_mass_g, island):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT (species, island, sex) FROM penguins WHERE body_mass_g >= '{0}' AND island = '{1}'".format(body_mass_g, island)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_and(body_mass_g=6000, island='Biscoe')
print(results)

WHERE Value OR Value

def get_penguins_or(island1, island2):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE (island = '{0}' OR island = '{1}')".format(island1, island2)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_or(island1='Biscoe',island2='Dream')
print(results)

WHERE NOT

def get_penguins_not(island, sex):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE NOT island = '{0}' AND NOT sex = '{1}'".format(island, sex)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_not(island='Biscoe',sex='MALE')
print(results)

WHERE LIKE

def get_penguins_like(island, sex):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE island LIKE '{0}' AND sex LIKE '{1}%'".format(island, sex)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_like(island='%eam',sex='MA%')
print(results)

Transformations

ORDER BY

def get_penguins_return_ordered(island, orderby):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE island = '{0}' ORDER BY {1} DESC".format(island, orderby)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_return_ordered(island='Dream', orderby='species')
print(results)

Return AS

def get_penguins_return_math(uid):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (culmen_length_mm + culmen_depth_mm) / body_mass_g AS ident FROM penguins WHERE uid = '{0}'".format(uid)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_return_math(uid='649bc5d39b21a1b93e0985a3')
print(results)

Aggregations

AVG(), COUNT(), MIN(), MAX(), SUM()

def get_penguins_return_avg(column):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT AVG({0}) AS average FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_return_avg(column='body_mass_g')
print(results)
def get_penguins_return_count(column):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT COUNT({0}) AS row_count FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# count all rows in a table
results = get_penguins_return_count(column='*')
print(results)
# count all rows in specified column excluding NULL values
results = get_penguins_return_count(column='sex')
print(results)
def get_penguins_return_count_distinct(column):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT COUNT(DISTINCT {0}) AS distinct_row_count FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# count all rows in specified column excluding NULL values and duplicates
results = get_penguins_return_count(column='uid')
print(results)
def get_penguins_return_minmax(column):
""" query data from the penguins table """
min_values = []
max_values = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT MIN({0}) AS average FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
min_values.append(penguin)

query = "SELECT MAX({0}) AS average FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
max_values.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(min_values, max_values)
conn.close()


if __name__ == '__main__':
results = get_penguins_return_minmax(column='body_mass_g')
print(results)
def get_penguins_return_sum_total(column1, column2):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()

query = "SELECT SUM({0} + {1}) AS total FROM penguins".format(column1, column2)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
results = get_penguins_return_sum_total(column1='culmen_length_mm', column2='culmen_depth_mm')
print(results)

GROUP BY

def get_penguins_return_count_groups(column, groups):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT COUNT(DISTINCT {0}) AS distinct_row_count FROM penguins GROUP BY {1}".format(column, groups)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# this returns 3 groups (MALE, FEMALE, NULL) for all 3 island groups
results = get_penguins_return_count_groups(column='uid', groups='island, sex')
print(results)

GROUP BY Aggregation

def get_penguins_return_group_count_agg(group, condition):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, COUNT(*) AS distinct_row_count FROM penguins GROUP BY {0} HAVING COUNT (*) {1}".format(group, condition)
cursor.execute(query)
penguins = cursor.fetchall()

for penguin in penguins:
results.append(penguin)

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()


if __name__ == '__main__':
# return counts for all species that have more than 100 specimens
results = get_penguins_return_group_count_agg(group='species', condition='>100')
print(results)