
Python - Natural Language Processing

Natural Language Toolkit

Finding Word Stems

Use a lemmatization function to find the root of a word and compare different forms of the same word - e.g. be, was, is, etc. Install the nltk package:

pip install nltk

And download all the data with python -m nltk.downloader all or just the required files to the default location /home/myuser/nltk_data by adding the following lines to your code:

nltk.download('wordnet')
nltk.download('omw-1.4')
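
nltk.download also accepts a download_dir argument if you prefer a custom location - a minimal sketch (the directory below is just an example), note that the custom path then has to be registered via nltk.data.path:

import nltk

# example path - adjust to your setup
custom_dir = '/opt/app/nltk_data'

# download the required corpora into the custom directory
nltk.download('wordnet', download_dir=custom_dir)
nltk.download('omw-1.4', download_dir=custom_dir)

# tell nltk where to look for the data at runtime
nltk.data.path.append(custom_dir)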

A quick loop over the following list shows that all of those words share the same lemma - be:

import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

words = ['was', 'is', 'am', 'be']

lemmatizer = nltk.stem.WordNetLemmatizer()


for word in words:
    lemma = lemmatizer.lemmatize(word, 'v')  # n = noun, v = verb, a = adjective, r = adverb, s = satellite adjective
    print(lemma)
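
The second argument decides which part of speech the lemmatizer assumes - the same surface form can resolve to different lemmas depending on it. A small illustration (the example words are my own):

import nltk

lemmatizer = nltk.stem.WordNetLemmatizer()

# treated as a verb, 'meeting' is reduced to its verb lemma
print(lemmatizer.lemmatize('meeting', 'v'))  # meet
# treated as a noun it stays unchanged
print(lemmatizer.lemmatize('meeting', 'n'))  # meeting
# adjectives get their own lemmas as well
print(lemmatizer.lemmatize('better', 'a'))   # good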

Lemmatizing Running Text

To work with full sentences we first need to tokenize the sentence - breaking it up into single words - and apply each word's part of speech (noun, verb, etc.) dynamically.

import nltk

# only run once
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

text = 'Experiments on mice at Boston University have spotlighted an ambiguous U.S. policy for research on potentially dangerous pathogens.'
# break the text up into single words, converted to lower case
tokens = nltk.word_tokenize(text.lower())
# get the pos tag for each token (check if it is verb, noun, etc.)
tags = nltk.pos_tag(tokens)

# print(tags)
# [('experiments', 'NNS'), ('on', 'IN'), ('mice', 'NNS'), ('at', 'IN'), ('boston', 'NN'), ('university', 'NN'), ('have', 'VBP'), ('spotlighted', 'VBN'), ('an', 'DT'), ('ambiguous', 'JJ'), ('u.s.', 'NN'), ('policy', 'NN'), ('for', 'IN'), ('research', 'NN'), ('on', 'IN'), ('potentially', 'RB'), ('dangerous', 'JJ'), ('pathogens', 'NNS'), ('.', '.')]

lemmatizer = nltk.stem.WordNetLemmatizer()
text_lemmas = []

for token, tag in zip(tokens, tags):
    # extract the part-of-speech tag
    tag_pos = tag[1][0].lower()
    # print(token, tag_pos)
    # exclude prepositions, articles, etc.
    if tag_pos in ['n', 'v', 'a', 'r']:
        lemma = lemmatizer.lemmatize(token, tag_pos)
        text_lemmas.append(lemma)


print(text_lemmas)
# ['experiment', 'mouse', 'boston', 'university', 'have', 'spotlight', 'u.s.', 'policy', 'research', 'potentially', 'pathogen']
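
Note that taking only the first letter of the Penn Treebank tag maps adjectives (JJ) to 'j', so they never pass the filter above - which is why ambiguous and dangerous are missing from the result. If you want adjectives lemmatized as well, a small helper can translate the Treebank tags into WordNet POS constants - a sketch, the helper name is my own:

from nltk.corpus import wordnet

def treebank_to_wordnet(treebank_tag):
    # map the first letter of a Penn Treebank tag to a WordNet POS constant
    if treebank_tag.startswith('J'):
        return wordnet.ADJ   # 'a'
    if treebank_tag.startswith('V'):
        return wordnet.VERB  # 'v'
    if treebank_tag.startswith('N'):
        return wordnet.NOUN  # 'n'
    if treebank_tag.startswith('R'):
        return wordnet.ADV   # 'r'
    # everything else (prepositions, articles, etc.) gets skipped
    return None

# usage inside the loop above:
# tag_pos = treebank_to_wordnet(tag[1])
# if tag_pos:
#     text_lemmas.append(lemmatizer.lemmatize(token, tag_pos))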

Compare Word Stems

import nltk

# only run once
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

text1 = 'The Experiments on mice...'
text2 = 'The Experiment on a mouse...'

lemmatizer = nltk.stem.WordNetLemmatizer()

def get_lemmas(text):
    # break the text up into single words, converted to lower case
    tokens = nltk.word_tokenize(text.lower())
    tags = nltk.pos_tag(tokens)

    text_lemmas = []

    for token, tag in zip(tokens, tags):
        # extract the part-of-speech tag
        tag_pos = tag[1][0].lower()
        # exclude prepositions, articles, etc.
        if tag_pos in ['n', 'v', 'a', 'r']:
            lemma = lemmatizer.lemmatize(token, tag_pos)
            text_lemmas.append(lemma)

    # print(text_lemmas)
    return text_lemmas

source1 = get_lemmas(text1)
source2 = get_lemmas(text2)


print(source1 == source2)
# True

Similarity Coefficient

Compare a set of sentences to a query string and return the string with the highest similarity:

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

lemmatizer = nltk.stem.WordNetLemmatizer()

# only run once
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

json_response = { "articles": [
{ "id": "432rsde34t",
"title": "Some Title",
"abstract": "Experiments on mice at Boston University have spotlighted an ambiguous U.S. policy for research on potentially dangerous pathogens.",
"author": "Some Author"},
{ "id": "67Gfdhnd4",
"title": "Some Title",
"abstract": "The move puts President Biden’s debt relief plan on hold. The court granted a stay in response to an appeal filed by six Republican-led states.",
"author": "Some Author"},
{ "id": "sHB8679iasd",
"title": "Some Title",
"abstract": "The new Communist Party elite will limit potential resistance to Mr. Xi’s agenda of bolstering security and expanding state sway over the economy.",
"author": "Some Author"},
{ "id": "dhg456wASF",
"title": "Some Title",
"abstract": "When Laurene Powell Jobs unveiled a website dedicated to her husband, many wondered if it could change how influential people burnish their legacies.",
"author": "Some Author"},
{ "id": "gfdh346Nr",
"title": "Some Title",
"abstract": "If former President Trump turns down the drama of testifying, his legal team could mount several constitutional and procedural arguments in court.",
"author": "Some Author"}
]
}

# print(json_response['articles'][0]['abstract'])

text = ""

# extract article abstracts and combine them
for article in json_response['articles']:
    # print(article['abstract'])
    text = text + article['abstract'] + ' '

# print(text)


# compare articles to the following search query
query = 'University Boston Experiment'

# get list of single sentences out of combined text
sentences = nltk.sent_tokenize(text)
# print(sentences)
# append query sentence to list
sentences.append(query)


def get_lemmas(text):
    # break the text up into single words, converted to lower case
    tokens = nltk.word_tokenize(text.lower())
    tags = nltk.pos_tag(tokens)

    text_lemmas = []

    for token, tag in zip(tokens, tags):
        # extract the part-of-speech tag
        tag_pos = tag[1][0].lower()
        # exclude prepositions, articles, etc.
        if tag_pos in ['n', 'v', 'a', 'r']:
            lemma = lemmatizer.lemmatize(token, tag_pos)
            text_lemmas.append(lemma)

    return text_lemmas

# get lemmas out of list of sentences
tv = TfidfVectorizer(tokenizer=get_lemmas)
# generate matrix with weights for each lemma in the given text (how often do they appear)
tf = tv.fit_transform(sentences)

# import pandas as pd
# df = pd.DataFrame(tf.toarray(), columns=tv.get_feature_names_out())
# print(df)
# this returns the matrix of words and their relative weight.
# each row represents a sentence that we fed into the function.
# the last row tf[-1] is the query string
# # agenda appeal argument bolster boston burnish ... university unveil website wonder xi ’
# # 0 0.000000 0.000000 0.000000 0.000000 0.26162 0.000000 ... 0.26162 0.000000 0.000000 0.000000 0.000000 0.000000
# # 1 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 ... 0.00000 0.000000 0.000000 0.000000 0.000000 0.305598
# # 2 0.000000 0.395963 0.000000 0.000000 0.00000 0.000000 ... 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
# # 3 0.263724 0.000000 0.000000 0.263724 0.00000 0.000000 ... 0.00000 0.000000 0.000000 0.000000 0.263724 0.218913
# # 4 0.000000 0.000000 0.000000 0.000000 0.00000 0.288675 ... 0.00000 0.288675 0.288675 0.288675 0.000000 0.000000
# # 5 0.000000 0.000000 0.326545 0.000000 0.00000 0.000000 ... 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
# # 6 0.000000 0.000000 0.000000 0.000000 0.57735 0.000000 ... 0.57735 0.000000 0.000000 0.000000 0.000000 0.000000

# # [7 rows x 59 columns]

# Now we can calculate the relative similarity
# of each sentence to the query string

coefficients = cosine_similarity(tf[-1], tf)

# print(coefficients)
# the result is that the query string matches itself by 100%
# and the next best match is the first sentence
# [[0.4531384 0. 0. 0. 0. 0. 1.]]

# now we can sort that list and extract the matching sentence
# index = coefficients.argsort()[0]
# the result is a nested list use zero index or flatten() to extract
# print(index)

# 6 represents the query string and 0 is the position of the best match
# [1 2 3 4 5 0 6]

# so we need to extract the second to last
index = coefficients.argsort().flatten()[-2]

# we can use the index to get the sentence with the best match
print(sentences[index])

# Result:
# Experiments on mice at Boston University have spotlighted an ambiguous U.S. policy for research on potentially dangerous pathogens.
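
Instead of only taking the single best match, the same coefficients can be used to rank every sentence - a small sketch building on the variables above (the query itself, stored in the last position, is skipped):

# rank all sentences by their similarity to the query (highest first),
# leaving out the last entry, which is the query itself
scores = coefficients.flatten()[:-1]
ranking = scores.argsort()[::-1]

for i in ranking:
    print(round(scores[i], 4), '::', sentences[i])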

Sentiment Analysis

Analyze the overall sentiment towards a product based on its customer reviews:

from nltk.sentiment import SentimentIntensityAnalyzer

# run once
# import nltk
# nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()

json_response = { "reviews": [
{ "id": "432rsde34t",
"title": "Some Title",
"text": "I regret this purchase... Only buy if you enjoy a keyboard that goes to sleep even when hardwired and takes like 5 seconds to wake up. This means copy and paste will sometimes fail on the copy because keyboard was sleeping or you think you are typing but you are not lol. So if you throw in the fact that you will get random stuck keys and sometimes profile changes what you actually have is just an annoying keyboard. I have never done so many updates on a review, this is what a mistake it is to purchase this hardware. And take it from me, someone who has a lot of razer products and all of which have similar issues... look else where, Razer is not the company you remember, very disappointing.",
"author": "Some Author"},
{ "id": "67Gfdhnd4",
"title": "Some Title",
"text": "I do have a couple of complaints, though. The software is a bit slow and bloated and seems to slow down my startup time on my computer. I am also having issues with the volume wheel - sometimes it will scroll web pages for some reason and does weird things like jump from 50 to 100 or turn up when i turn down. I also wish it was a bit cheaper, but for my requirements, I did not really have much choice.",
"author": "Some Author"},
{ "id": "sHB8679iasd",
"title": "Some Title",
"text": "I purchased the full length Halo Infinite edition with green switches for the tactile clickiness. Needed a second keyboard for work, thought why not try a smaller form factor from the same product line with the same green switches. I am guessing it is from the phantom keycaps, but the green switches are somewhat muffled and mushy in this form factor. I am 50/50 on the phantom edition keycaps for this form factor. I am new to it, so where my muscle memory puts the keys is incorrect. And without static backlighting set, I feel lost while trying to blaze through a long work email. I decided to use my full length version for work and this for my personal gaming desktop since I mainly use a controller or a Logitech G13 when controllers are not supported.",
"author": "Some Author"},
{ "id": "dhg456wASF",
"title": "Some Title",
"text": "I was REALLY enjoying this keyboard (and my wife was enjoying hers too) - however, both keyboard developped an extra keystroke when typing. This lead to extra letters being inputted, which made the keyboard unuseable. Its a shame, because for the price it was a great keyboard. It also fit really well into our Razer ecosystem AND had amazing battery life with the RGB on. Alas, the keystroke issue is too big a hassle to try again, sadly.",
"author": "Some Author"},
{ "id": "gfdh346Nr",
"title": "Some Title",
"text": "This keyboard is excellent, but only if you mod this keyboard, like add some foam, lube the switches, add some painters tape, bandaid mod the stabs and lube them, then switch for some Durocks (cus razer is trying to be cool again and make their own, which quite frankly it's the same as plate mount stabs unlike the huntsman mini) so for the price of this keyboard it is good when amazon discount it but in general for 200$ Nah pass (but I got a deal for only 126$). I kind of get where they use premium metal, good lithium battery, two signal receivers, three battery cutoff the PCB, the power monitor and distribute board with, and the battery integrated itself but still, only if people knew this. They might appreciate it a little more. Still, the transparent bottom plastic with the glowing razer logo scratches way too quickly; that is all I have to say about this keyboard.",
"author": "Some Author"}
]
}

polarity_scores = []

# calculate a polarity score for each review text
for review in json_response['reviews']:
    score = analyzer.polarity_scores(review['text'])
    polarity_scores.append(score)

# print(polarity_scores)
# [{'neg': 0.149, 'neu': 0.823, 'pos': 0.027, 'compound': -0.953}, {'neg': 0.042, 'neu': 0.91, 'pos': 0.047, 'compound': 0.1027}, {'neg': 0.042, 'neu': 0.958, 'pos': 0.0, 'compound': -0.659}, {'neg': 0.085, 'neu': 0.683, 'pos': 0.231, 'compound': 0.9397}, {'neg': 0.053, 'neu': 0.822, 'pos': 0.125, 'compound': 0.9109}]

# neg = negativity score
# neu = neutrality score
# pos = positivity score
# compound = sentiment score can range from -1 to 1. closer to 1 = more positive

sentiment_sum = 0

for compound_sentiment in polarity_scores:
    # print(compound_sentiment['compound'])
    sentiment_sum = sentiment_sum + float(compound_sentiment['compound'])

average_sentiment = sentiment_sum / len(polarity_scores)

# print(average_sentiment)
# # 0.06825999999999999

if average_sentiment > 0.3:
    print('INFO :: The average sentiment is POSITIVE')
elif average_sentiment < -0.3:
    print('INFO :: The average sentiment is NEGATIVE')
else:
    print('INFO :: The average sentiment is NEUTRAL')


# INFO :: The average sentiment is NEUTRAL
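
To get a feeling for the scores you can run the analyzer on isolated sentences - a short sketch, the example sentences are made up and the exact numbers will vary:

from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# a clearly positive and a clearly negative statement (made-up examples)
positive = analyzer.polarity_scores('I love this keyboard, it works great!')
negative = analyzer.polarity_scores('This keyboard is terrible and broke after a week.')

# the compound value will be clearly positive for the first
# and clearly negative for the second sentence
print(positive['compound'], negative['compound'])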

Online Search (Wikipedia)
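
The same approach can be used to search through a full Wikipedia article - fetched here with the wikipedia package (pip install wikipedia) - and return the sentence that best matches a query entered on the command line: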

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import wikipedia

lemmatizer = nltk.stem.WordNetLemmatizer()

# only run once
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

wiki = wikipedia.page('List of Game of Thrones characters', auto_suggest=False).content

# get list of single sentences out of combined text
response = nltk.sent_tokenize(wiki)


def get_lemmas(text):
    # break the text up into single words, converted to lower case
    tokens = nltk.word_tokenize(text.lower())
    tags = nltk.pos_tag(tokens)

    text_lemmas = []

    for token, tag in zip(tokens, tags):
        # extract the part-of-speech tag
        tag_pos = tag[1][0].lower()
        # exclude prepositions, articles, etc.
        if tag_pos in ['n', 'v', 'a', 'r']:
            lemma = lemmatizer.lemmatize(token, tag_pos)
            text_lemmas.append(lemma)

    return text_lemmas


def find_similarity(response, query):
    # get lemmas out of the list of sentences
    tv = TfidfVectorizer(tokenizer=get_lemmas)
    # generate a matrix with weights for each lemma in the given text (how often they appear)
    tf = tv.fit_transform(response)
    # calculate the relative similarity of each sentence to the query string
    coefficients = cosine_similarity(tf[-1], tf)
    # the query is the last entry and matches itself - so take the second-to-last index
    index = coefficients.argsort().flatten()[-2]
    score = coefficients.flatten()[index]
    if score > 0:
        return ':: RESULT :: ' + response[index] + ' :: SCORE :: ' + str(score) + ' ::'
    else:
        return ':: INFO :: No Match Found'


while True:
    query = input(':: Query Input:: ')
    if query == 'quit':
        print(':: INFO :: Shutting down...')
        quit()
    else:
        # pass the query in as the last sentence without permanently adding it to the corpus
        output = find_similarity(response=response + [query], query=query)
        print(output)