- SQL in Data Science
SQL in Data Science
PostgreSQL lets you build queries which run a variety of machine learning algorithms against your data.
- SQL in Data Science - The Basics using Python
- SQL in Data Science - Slightly more Advanced Queries
- SQL in Data Science - Machine Learning
Database Installation with Docker Compose
mkdir /opt/pgdata
sudo chown myuser:myuser /opt/pgdata
sudo chmod 777 /opt/pgdata
docker pull postgres:alpine
docker-compose.yml
# Use postgres/example user/password credentials
version: '3'
services:
db:
image: postgres:alpine
restart: unless-stopped
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: password
POSTGRES_DB: testing
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- /opt/pgdata:/var/lib/postgresql/data
ports:
- 5432:5432
adminer:
image: adminer
restart: unless-stopped
ports:
- 8080:8080
docker-compose up
http://localhost:8080
SQL and Python
pip install psycopg2-binary
Create a connection object:
hello_world.py
import psycopg2
DB = 'testing'
HOST = 'localhost'
USER = 'postgres'
PASS = 'password'
PORT = 5432
conn = psycopg2.connect(
database = DB,
host = HOST,
user = USER,
password = PASS,
port = PORT
)
A cursor object will help you execute any queries on the database and retrieve data. Here's how to create a cursor object:
cursor = conn.cursor()
Creating Tables
def create_tables():
""" create tables in the PostgreSQL database"""
commands = (
"""
CREATE TABLE vendors (
vendor_id SERIAL PRIMARY KEY,
vendor_name VARCHAR(255) NOT NULL
)
""",
""" CREATE TABLE parts (
part_id SERIAL PRIMARY KEY,
part_name VARCHAR(255) NOT NULL
)
""",
"""
CREATE TABLE part_drawings (
part_id INTEGER PRIMARY KEY,
file_extension VARCHAR(5) NOT NULL,
drawing_data BYTEA NOT NULL,
FOREIGN KEY (part_id)
REFERENCES parts (part_id)
ON UPDATE CASCADE ON DELETE CASCADE
)
""",
"""
CREATE TABLE vendor_parts (
vendor_id INTEGER NOT NULL,
part_id INTEGER NOT NULL,
PRIMARY KEY (vendor_id , part_id),
FOREIGN KEY (vendor_id)
REFERENCES vendors (vendor_id)
ON UPDATE CASCADE ON DELETE CASCADE,
FOREIGN KEY (part_id)
REFERENCES parts (part_id)
ON UPDATE CASCADE ON DELETE CASCADE
)
""")
)
try:
for command in commands:
cursor.execute(command)
# close communication with the PostgreSQL database server
cursor.close()
# commit the changes
conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
print('INFO :: Database connection closed.')
We can test the connection by running:
python 01_create_read_table.py
INFO :: Connecting to database...
INFO :: PostgreSQL database version:
('PostgreSQL 15.3 on x86_64-pc-linux-musl, compiled by gcc (Alpine 12.2.1_git20220924-r10) 12.2.1 20220924, 64-bit',)
INFO :: Database connection closed.
Inserting Data
def insert_vendor(vendor_name):
""" insert a new vendor into the vendors table """
sql = """INSERT INTO vendors(vendor_name)
VALUES(%s) RETURNING vendor_id;"""
conn = None
vendor_id = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cur = conn.cursor()
# execute the INSERT statement
cur.execute(sql, (vendor_name,))
# get the generated id back
vendor_id = cur.fetchone()[0]
# commit the changes to the database
conn.commit()
# close communication with the database
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
return vendor_id
if __name__ == '__main__':
# insert one vendor
insert_vendor("3M Co.")
Updating Data
def update_vendor(vendor_id, vendor_name):
""" update vendor name based on the vendor id """
sql = """ UPDATE vendors
SET vendor_name = %s
WHERE vendor_id = %s"""
conn = None
updated_rows = 0
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cur = conn.cursor()
# execute the UPDATE statement
cur.execute(sql, (vendor_name, vendor_id))
# get the number of updated rows
updated_rows = cur.rowcount
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
return updated_rows
if __name__ == '__main__':
# Update vendor id 1
update_vendor(1, "3M Corp")
Retrieving Data
We use the execute()
function and submit a query string as its argument. This query that we submitted will be run against the database:
cursor.execute("SELECT * FROM my_table")
After the query executes, we need to use one of these functions to retrieve data row(s):
- fetchone() -
print(cursor.fetchone())
- fetchall() -
print(cursor.fetchall())
- fetchmany() -
print(cursor.fetchmany(size=3))
Querying data using fetchone() method
def get_vendor():
""" query data from the vendors table """
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
cursor.execute("SELECT vendor_id, vendor_name FROM vendors ORDER BY vendor_name")
print("The number of vendors: ", cursor.rowcount)
print(cursor.fetchone())
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
python 04_fetch_data.py
The number of vendors: 7
(1, '3M Corp')
Querying data using fetchall() method
def get_all_vendors():
""" query data from the vendors table """
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
cursor.execute("SELECT vendor_id, vendor_name FROM vendors ORDER BY vendor_name")
rows = cursor.fetchall()
print("The number of vendors: ", cursor.rowcount)
for row in rows:
print(row)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
python 04_fetch_data.py
The number of vendors: 7
(1, '3M Corp')
(2, 'AKM Semiconductor Inc.')
(3, 'Asahi Glass Co Ltd.')
(4, 'Daikin Industries Ltd.')
(5, 'Dynacast International Inc.')
(6, 'Foster Electric Co. Ltd.')
(7, 'Murata Manufacturing Co. Ltd.')
The number of vendors: 7
Querying data using fetchmany() method
def get_vendors(size):
""" query data from the vendors table """
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
cursor.execute("SELECT vendor_id, vendor_name FROM vendors ORDER BY vendor_name")
rows = cursor.fetchmany(size=size)
print("The number of vendors: ", cursor.rowcount)
for row in rows:
print(row)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
python 04_fetch_data.py
The number of vendors: 7
(1, '3M Corp')
(2, 'AKM Semiconductor Inc.')
(3, 'Asahi Glass Co Ltd.')
Deleting Data
def delete_part(part_id):
""" delete vendors by id """
conn = None
rows_deleted = 0
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# execute the UPDATE statement
cursor.execute("DELETE FROM vendors WHERE vendor_id = %s", (part_id,))
# get the number of updated rows
rows_deleted = cursor.rowcount
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
return rows_deleted
if __name__ == '__main__':
deleted_rows = delete_part(2)
print('The number of deleted rows: ', deleted_rows)
python 05_delete_data.py
The number of deleted rows: 1
Create Table from CSV
data = pd.read_csv ('dataset_penguins.csv')
df = pd.DataFrame(data)
def create_table():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# create the table
cursor.execute('''
CREATE TABLE penguins (
uid CHAR(25) primary key,
species VARCHAR(10),
island VARCHAR(10),
culmen_length_mm FLOAT,
culmen_depth_mm FLOAT,
flipper_length_mm FLOAT,
body_mass_g INT,
sex VARCHAR(10)
)
''')
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
def add_data():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# insert data
for row in df.itertuples():
cursor.execute('INSERT INTO penguins (uid, species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (row.uid, row.species, row.island, row.culmen_length_mm, row.culmen_depth_mm, row.flipper_length_mm, row.body_mass_g, row.sex))
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
Filter Conditions
WHERE
def get_penguins_by_id(uid):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, body_mass_g, sex) FROM penguins WHERE uid = '{0}'".format(uid)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_by_id(uid='649bc5d39b21a1b93e0985a5')
print(results)
WHERE by Value
def get_penguins_by_mass_thresh(body_mass_g):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE body_mass_g >= '{0}'".format(body_mass_g)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_by_mass_thresh(body_mass_g=6000)
print(results)
WHERE BETWEEN Values
def get_penguins_by_mass_range(lower, upper):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE body_mass_g BETWEEN '{0}' AND '{1}'".format(lower, upper)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_by_mass_range(lower=4300, upper=4500)
print(results)
WHERE by No-Match
def get_penguins_by_non_match(species):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE species <> '{0}'".format(species)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_by_non_match(species='Adelie')
print(results)
WHERE IS NULL
def get_penguins_where_sex_is_null():
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (uid, species, island) FROM penguins WHERE sex IS NULL"
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_where_sex_is_null()
print(results)
WHERE IN Group
def get_penguins_in(group):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE uid IN {0}".format(group)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_in(
group=('649bc5d39b21a1b93e0985a5', '649bc5d39b21a1b93e0986de', '649bc5d39b21a1b93e098671')
)
print(results)
WHERE Value AND Value
def get_penguins_and(body_mass_g, island):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE body_mass_g >= '{0}' AND island = '{1}'".format(body_mass_g, island)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_and(body_mass_g=6000, island='Biscoe')
print(results)
WHERE Value OR Value
def get_penguins_or(island1, island2):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE (island = '{0}' OR island = '{1}')".format(island1, island2)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_or(island1='Biscoe',island2='Dream')
print(results)
WHERE NOT
def get_penguins_not(island, sex):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE NOT island = '{0}' AND NOT sex = '{1}'".format(island, sex)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_not(island='Biscoe',sex='MALE')
print(results)
WHERE LIKE
def get_penguins_like(island, sex):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE island LIKE '{0}' AND sex LIKE '{1}%'".format(island, sex)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_like(island='%eam',sex='MA%')
print(results)
Transformations
ORDER BY
def get_penguins_return_ordered(island, orderby):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (species, island, sex) FROM penguins WHERE island = '{0}' ORDER BY {1} DESC".format(island, orderby)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_return_ordered(island='Dream', orderby='species')
print(results)
Return AS
def get_penguins_return_math(uid):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT (culmen_length_mm + culmen_depth_mm) / body_mass_g AS ident FROM penguins WHERE uid = '{0}'".format(uid)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_return_math(uid='649bc5d39b21a1b93e0985a3')
print(results)
Aggregations
AVG()
,COUNT()
,MIN()
,MAX()
,SUM()
def get_penguins_return_avg(column):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT AVG({0}) AS average FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_return_avg(column='body_mass_g')
print(results)
def get_penguins_return_count(column):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT COUNT({0}) AS row_count FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
# count all rows in a table
results = get_penguins_return_count(column='*')
print(results)
# count all rows in specified column excluding NULL values
results = get_penguins_return_count(column='sex')
print(results)
def get_penguins_return_count_distinct(column):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT COUNT(DISTINCT {0}) AS distinct_row_count FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
# count all rows in specified column excluding NULL values and duplicates
results = get_penguins_return_count(column='uid')
print(results)
def get_penguins_return_minmax(column):
""" query data from the penguins table """
min_values = []
max_values = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT MIN({0}) AS average FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
min_values.append(penguin)
query = "SELECT MAX({0}) AS average FROM penguins".format(column)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
max_values.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(min_values, max_values)
conn.close()
if __name__ == '__main__':
results = get_penguins_return_minmax(column='body_mass_g')
print(results)
def get_penguins_return_sum_total(column1, column2):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT SUM({0} + {1}) AS total FROM penguins".format(column1, column2)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
results = get_penguins_return_sum_total(column1='culmen_length_mm', column2='culmen_depth_mm')
print(results)
GROUP BY
def get_penguins_return_count_groups(column, groups):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT COUNT(DISTINCT {0}) AS distinct_row_count FROM penguins GROUP BY {1}".format(column, groups)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
# this returns 3 groups (MALE, FEMALE, NULL) for all 3 island groups
results = get_penguins_return_count_groups(column='uid', groups='island, sex')
print(results)
GROUP BY Aggregation
def get_penguins_return_group_count_agg(group, condition):
""" query data from the penguins table """
results = []
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = "SELECT {0}, COUNT(*) AS distinct_row_count FROM penguins GROUP BY {0} HAVING COUNT (*) {1}".format(group, condition)
cursor.execute(query)
penguins = cursor.fetchall()
for penguin in penguins:
results.append(penguin)
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
if __name__ == '__main__':
# return counts for all species that have more than 100 specimens
results = get_penguins_return_group_count_agg(group='species', condition='>100')
print(results)