Skip to main content

Guangzhou, China

Github Repository

SQL in Data Science

PostgreSQL lets you build queries which run a variety of machine learning algorithms against your data.

  1. SQL in Data Science - The Basics using Python
  2. SQL in Data Science - Slightly more Advanced Queries
  3. SQL in Data Science - Machine Learning

Database Installation with Docker Compose

mkdir /opt/pgdata
sudo chown myuser:myuser /opt/pgdata
sudo chmod 777 /opt/pgdata
docker pull postgres:alpine

docker-compose.yml

# Use postgres/example user/password credentials
version: '3'

services:

db:
image: postgres:alpine
restart: unless-stopped
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: password
POSTGRES_DB: testing
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- /opt/pgdata:/var/lib/postgresql/data
ports:
- 5432:5432

adminer:
image: adminer
restart: unless-stopped
ports:
- 8080:8080
docker-compose up
import psycopg2
from config import config
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Dataset

!wget https://gist.github.com/Thanatoz-1/9e7fdfb8189f0cdf5d73a494e4a6392a/raw/aaecbd14aeaa468cd749528f291aa8a30c2ea09e/iris_dataset.csv
data_iris = pd.read_csv ('iris_dataset.csv')   
df_iris = pd.DataFrame(data_iris)
df_iris.rename(columns={
'sepal length (cm)': 'sepal_length',
'sepal width (cm)': 'sepal_width',
'petal length (cm)': 'petal_length',
'petal width (cm)': 'petal_width',
}, inplace=True)
df_iris.head(5)
sepal_lengthsepal_widthpetal_lengthpetal_widthtarget
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
def create_table():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# create the table
cursor.execute('''
CREATE TABLE iris (
index SMALLINT primary key,
sepal_length FLOAT,
sepal_width FLOAT,
petal_length FLOAT,
petal_width FLOAT,
target VARCHAR
)
''')
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
def add_data():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# insert data
for row in df_iris.itertuples():
cursor.execute('''
INSERT INTO iris (
index, sepal_length, sepal_width, petal_length, petal_width, target
) VALUES (
%s, %s, %s, %s, %s, %s
)''',
(
row.Index,
row.sepal_length,
row.sepal_width,
row.petal_length,
row.petal_width,
row.target
)
)
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
create_table()
add_data()

Linear Regression

query = f"""
WITH regression AS
(SELECT
regr_slope(petal_width, petal_length) AS gradient,
regr_intercept(petal_width, petal_length) AS intercept
FROM
iris
WHERE
petal_width IS NOT NULL
)

SELECT
petal_length,
(petal_length * gradient) + intercept AS prediction
FROM
iris
CROSS JOIN
regression
WHERE
petal_width IS NOT NULL ;
"""
def linear_regression(query):
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = query
cursor.execute(query)
results = cursor.fetchall()

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
results = linear_regression(query=query)
df_pred = pd.DataFrame(results, columns=['petal_length', 'petal_width_pred'])
df_pred.head()
petal_lengthpetal_width_pred
01.40.216473
11.40.216473
21.30.174831
31.50.258115
41.40.216473
fig = plt.figure(figsize=(10,5))
ax1 = fig.add_subplot(111)

ax1.scatter(data=df_pred, x='petal_length', y='petal_width_pred', s=10, c='b', marker="s", label='prediction')
ax1.scatter(data=df_iris, x='petal_length', y='petal_width', s=10, c='r', marker="o", label='true')
plt.legend(loc='upper left')
plt.xlabel('Iris Petal Length')
plt.ylabel('Iris Petal Width')

plt.savefig('assets/sql-for-data-science_01.webp', bbox_inches='tight')

SQL in Data Science

K-Nearest Neighbours

query = f'''
-- CTE to get labelled training data
WITH training AS
(SELECT
index,
POINT(sepal_length, petal_length) as xy,
target
FROM
iris
WHERE
target IS NOT NULL
),

-- CTE to get the unlabelled points
test AS
(SELECT
index,
POINT(sepal_length, petal_length) as xy,
target
FROM
iris
WHERE
target IS NOT NULL
),

-- calculate distances between unlabelled & labelled points
distances AS
(SELECT
test.index,
training.target,
test.xy <-> training.xy AS dist,
ROW_NUMBER() OVER (
PARTITION BY test.index
ORDER BY test.xy <-> training.xy
) AS row_no
FROM
test
CROSS JOIN training
ORDER BY 1, 4 ASC
),

-- count the 'votes' per label for each unlabelled point
votes AS
(SELECT
index,
target,
count(*) AS votes
FROM distances
WHERE row_no <= 5
GROUP BY 1,2
ORDER BY 1)

-- query for the label with the most votes
SELECT
v.index,
v.target
FROM
votes v
JOIN
(SELECT
index,
max(votes) AS max_votes
FROM
votes
GROUP BY 1
) mv
ON v.index = mv.index
AND v.votes = mv.max_votes
ORDER BY 1 ASC ;
'''
def k_nearest_neighbours(query):
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = query
cursor.execute(query)
results = cursor.fetchall()

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
# run training with k=5
results = k_nearest_neighbours(query=query)
df_pred = pd.DataFrame(results, columns=['index', 'target_pred'])
df_pred.head()
indextarget_pred
00Iris-setosa
11Iris-setosa
22Iris-setosa
33Iris-setosa
44Iris-setosa
df_iris['target_pred'] = df_pred['target_pred']
df_iris.head()
sepal_lengthsepal_widthpetal_lengthpetal_widthtargettarget_pred
05.13.51.40.2Iris-setosaIris-setosa
14.93.01.40.2Iris-setosaIris-setosa
24.73.21.30.2Iris-setosaIris-setosa
34.63.11.50.2Iris-setosaIris-setosa
45.03.61.40.2Iris-setosaIris-setosa
fig, axes = plt.subplots(1, 2, figsize=(12,5))

sns.scatterplot(
x='sepal_length',
y='petal_length',
data=df_iris,
s=40,
alpha=0.6,
hue='target_pred',
palette='nipy_spectral',
ax=axes[0]
)

sns.scatterplot(
x='sepal_length',
y='petal_length',
data=df_iris,
s=40,
alpha=0.6,
hue='target',
palette='nipy_spectral',
ax=axes[1]
)

plt.savefig('assets/sql-for-data-science_02.webp', bbox_inches='tight')

SQL in Data Science

K-means Clustering

query = f'''
WITH points AS
(SELECT
index,
POINT(sepal_length, petal_length) AS xy
FROM
iris
),

initial AS
(SELECT
RANK() OVER (
ORDER BY random()
) AS cluster,
xy
FROM points
-- set k=3 since we know there are 3 classes
LIMIT 3
),

iteration AS
(WITH RECURSIVE kmeans(iter, index, cluster, avg_point) AS (
SELECT
1,
NULL::INTEGER,
*
FROM
initial
UNION ALL
SELECT
iter + 1,
index,
cluster,
midpoint
FROM (
SELECT DISTINCT ON(iter, index)
*
FROM (
SELECT
iter,
cluster,
p.index,
p.xy <-> k.avg_point AS distance,
@@ LSEG(p.xy, k.avg_point) AS midpoint,
p.xy,
k.avg_point
FROM points p
CROSS JOIN kmeans k
) d
ORDER BY 1, 3, 4
) r
WHERE iter < 1000
)
SELECT
*
FROM
kmeans
)

SELECT
k.*,
cluster
FROM
iteration i
JOIN
iris k
USING(index)
WHERE
iter = 1000
ORDER BY 4,1 ASC ;
'''
def k_means(query):
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = query
cursor.execute(query)
results = cursor.fetchall()

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
# run training with k=3 / max_iter=1000
results = k_nearest_neighbours(query=query)
df_pred = pd.DataFrame(results, columns=['index', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target', 'target_pred'])
df_pred.head()
fig, axes = plt.subplots(1, 2, figsize=(12,5))

sns.scatterplot(
x='sepal_length',
y='petal_length',
data=df_pred,
s=40,
alpha=0.6,
hue='target_pred',
palette='nipy_spectral',
ax=axes[0]
)

sns.scatterplot(
x='sepal_length',
y='petal_length',
data=df_pred,
s=40,
alpha=0.6,
hue='target',
palette='nipy_spectral',
ax=axes[1]
)

plt.savefig('assets/sql-for-data-science_03.webp', bbox_inches='tight')

SQL in Data Science

Dataset II

!wget https://github.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/raw/master/spam.csv
data_spam = pd.read_csv ('spam_dataset.csv')   
df_spam = pd.DataFrame(data_spam)
df_spam = df_spam.reset_index()
df_spam.head(5)
indexcatsms
0hamGo until jurong point crazy.. Available only i...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf he lives arou...
def create_table():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# create the table
cursor.execute('''
CREATE TABLE spam (
index SMALLINT primary key,
cat VARCHAR,
sms VARCHAR
)
''')
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
def add_data():
conn = None
try:
# read database configuration
params = config()
# connect to the PostgreSQL database
conn = psycopg2.connect(**params)
# create a new cursor
cursor = conn.cursor()
# insert data
for row in df_spam.itertuples():
cursor.execute('''
INSERT INTO spam (
index, cat, sms
) VALUES (
%s, %s, %s
)''',
(
row.index,
row.cat,
row.sms
)
)
# Commit the changes to the database
conn.commit()
# Close communication with the PostgreSQL database
cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
create_table()
add_data()

Naive Bayes Classification

query = f'''
-- CTE to create one row per word
WITH staging AS
(SELECT
REGEXP_SPLIT_TO_TABLE(
LOWER(sms), '[^a-z]+') AS word,
cat
FROM
spam
WHERE
cat IS NOT NULL
),

-- testing data
test AS
(SELECT
index,
sms
FROM
spam
WHERE
cat is NOT NULL
),

-- one row per word + cat
cartesian AS
(SELECT
*
FROM
(SELECT
DISTINCT word
FROM
staging) w
CROSS JOIN
(SELECT
DISTINCT cat
FROM
staging) c
WHERE
length(word) > 0
),

-- CTE of smoothed frequencies of each word by cat
frequencies AS
(SELECT
c.word,
c.cat,
-- numerator plus one
(SELECT
count(*)+1
FROM
staging s
WHERE
s.word = c.word
AND
s.cat = c.cat) /
-- denominator plus two
(SELECT
count(*)+2
FROM
staging s1
WHERE
s1.cat = c.cat) ::DECIMAL AS freq
FROM
cartesian c
),

-- for each row in testing, get the probabilities
probabilities AS
(SELECT
t.index,
f.cat,
SUM(LN(f.freq)) AS probability
FROM
(SELECT
index,
REGEXP_SPLIT_TO_TABLE(
LOWER(sms), '[^a-z]+') AS word
FROM
test) t
JOIN
(SELECT
word,
cat,
freq
FROM
frequencies) f
ON t.word = f.word
GROUP BY 1, 2
)

-- keep only the highest estimate
SELECT
sms,
probabilities.cat
FROM
probabilities
JOIN
(SELECT
index,
max(probability) AS max_probability
FROM
probabilities
GROUP BY 1) p
ON probabilities.index = p.index
AND probabilities.probability = p.max_probability
JOIN
test
ON probabilities.index = test.index
ORDER BY 1 ;
'''
def naive_bayes(query):
conn = None
try:
params = config()
conn = psycopg2.connect(**params)
cursor = conn.cursor()
query = query
cursor.execute(query)
results = cursor.fetchall()

cursor.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
return(results)
conn.close()
# buckle up....this will take a while
results = naive_bayes(query=query)
df_pred = pd.DataFrame(results, columns=['message', 'cat_pred'])
df_pred[df_pred['cat_pred'] == 'ham'].head(25)
messagecat_pred
0all write or wat..ham
1bot notes oredi... Cos i juz rem i got...ham
2called dad oredi...ham
3collecting ur laptop then going to configu...ham
4come lt 25 n pass to me larham
5comin to fetch us oredi...ham
6dun need to pick ur gf?ham
7dun wan to watch infernal affair?ham
8eatin later but i'm eatin wif my frens now...ham
9give me some time to walk there.ham
10go home liao? Ask dad to pick me up at 6...ham
11got wat to buy tell us then _ no need to...ham
12log off 4 wat. It's sdryb8iham
13mean it's confirmed... I tot they juz say ...ham
14neva tell me how i noe... I'm not at home ...ham
15no home work to do meh...ham
16only send me the contents page...ham
17predict wat time _'ll finish buying?ham
18ready then call me...ham
19say until like dat i dun buy ericsson oso ...ham
20still attending da talks?ham
21still got lessons? in sch?ham
23thk of wat to eat tonight.ham
24v ma fan...ham
25wait 4 me in sch i finish ard 5..ham
df_pred[df_pred['cat_pred'] == 'spam'].head(25)
messagecat_pred
22takin linear algebra today?spam
34and picking them up from various pointsspam
53(Bank of Granite issues Strong-Buy) EXPLOSIVE ...spam
60* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...spam
68FREE MESSAGEThanks for using the Auction S...spam
70+123 Congratulations - in this week's competit...spam
71+123 Congratulations - in this week's competit...spam
72+449071512431 URGENT! This is the 2nd attempt ...spam
73-PLS STOP bootydelious (32/F) is inviting you ...spam
7707732584351 - Rodger Burns - MSG = We tried to...spam
7808714712388 between 10am-7pm Cost 10pspam
7909066362231 URGENT! Your mobile No 07xxxxxxxxx...spam
800A$NETWORKS allow companies to bill for SMS so...spam
91100 dating service cal;l 09064012103 box334sk38chspam
921000's flirting NOW! Txt GIRL or BLOKE & ur NA...spam
931000's of girls many local 2 u who r virgins 2...spam
9418 days to Euro2004 kickoff! U will be kept in...spam
9518 days to Euro2004 kickoff! U will be kept in...spam
961Apple/Day=No Doctor. 1Tulsi Leaf/Day=No Cance...spam
971st wk FREE! Gr8 tones str8 2 u each wk. Txt N...spam
1012/2 146tf150pspam
10222 days to kick off! For Euro2004 U will be ke...spam
10325p 4 alfie Moon's Children in need song on ur...spam
10426th OF JULYspam
1072p per min to call Germany 08448350055 from yo...spam