- SQL in Data Science
SQL in Data Science
PostgreSQL lets you build queries which run a variety of machine learning algorithms against your data.
- SQL in Data Science - The Basics using Python
- SQL in Data Science - Slightly more Advanced Queries
- SQL in Data Science - Machine Learning
Database Installation with Docker Compose
mkdir /opt/pgdata
sudo chown myuser:myuser /opt/pgdata
sudo chmod 777 /opt/pgdata
docker pull postgres:alpine
docker-compose.yml
# Use postgres/example user/password credentials
version: '3'
services:
db:
image: postgres:alpine
restart: unless-stopped
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: password
POSTGRES_DB: testing
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- /opt/pgdata:/var/lib/postgresql/data
ports:
- 5432:5432
adminer:
image: adminer
restart: unless-stopped
ports:
- 8080:8080
docker-compose up
http://localhost:8080
SQL and Python
pip install psycopg2-binary
Create a connection object:
hello_world.py
import psycopg2
DB = 'testing'
HOST = 'localhost'
USER = 'postgres'
PASS = 'password'
PORT = 5432
conn = psycopg2.connect(
database = DB,
host = HOST,
user = USER,
password = PASS,
port = PORT
)
A cursor object will help you execute any queries on the database and retrieve data. Here's how to create a cursor object:
cursor = conn.cursor()
Creating Tables
def create_tables():
""" create tables in the PostgreSQL database"""
commands = (
"""
CREATE TABLE vendors (
vendor_id SERIAL PRIMARY KEY,
vendor_name VARCHAR(255) NOT NULL
)
""",
""" CREATE TABLE parts (
part_id SERIAL PRIMARY KEY,
part_name VARCHAR(255) NOT NULL
)
""",
"""
CREATE TABLE part_drawings (
part_id INTEGER PRIMARY KEY,
file_extension VARCHAR(5) NOT NULL,
drawing_data BYTEA NOT NULL,
FOREIGN KEY (part_id)
REFERENCES parts (part_id)
ON UPDATE CASCADE ON DELETE CASCADE
)
""",
"""
CREATE TABLE vendor_parts (
vendor_id INTEGER NOT NULL,
part_id INTEGER NOT NULL,
PRIMARY KEY (vendor_id , part_id),
FOREIGN KEY (vendor_id)
REFERENCES vendors (vendor_id)
ON UPDATE CASCADE ON DELETE CASCADE,
FOREIGN KEY (part_id)
REFERENCES parts (part_id)
ON UPDATE CASCADE ON DELETE CASCADE
)
""")
)
try:
for command in commands:
cursor.execute(command)
# close communication with the PostgreSQL database server
cursor.close()
# commit the changes
conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
print('INFO :: Database connection closed.')
We can test the connection by running:
python 01_create_read_table.py
INFO :: Connecting to database...
INFO :: PostgreSQL database version:
('PostgreSQL 15.3 on x86_64-pc-linux-musl, compiled by gcc (Alpine 12.2.1_git20220924-r10) 12.2.1 20220924, 64-bit',)
INFO :: Database connection closed.
Inserting Data
def insert_vendor(vendor_name):
""" insert a new vendor into the vendors table """
sql = """INSERT INTO vendors(vendor_name)
VALUES(%s) RETURNING vendor_id;"""
conn = None
vendor_id = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cur = conn.cursor()
# execute the INSERT statement
cur.execute(sql, (vendor_name,))
# get the generated id back
vendor_id = cur.fetchone()[0]
# commit the changes to the database
conn.commit()
# close communication with the database
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
return vendor_id
if __name__ == '__main__':
# insert one vendor
insert_vendor("3M Co.")