Skip to main content

Python - Text Processing with

Sham Sui Po, Hong Kong

Write Text Files

# Sample Markdown body: a plain paragraph followed by a blockquote.
content = """
Spicy jalapeno bacon ipsum dolor amet pancetta labore ribeye sirloin buffalo adipisicing pig short ribs ipsum aliquip pork excepteur ullamco minim. Ipsum officia beef ribs chuck pork chop picanha salami bacon. Hamburger shoulder biltong irure. Laborum nulla ut chuck ball tip rump chislic burgdoggen jerky pork belly irure short loin tri-tip leberkas. Chicken commodo salami swine in laborum corned beef picanha rump in labore.

> Labore fugiat pork ea ball tip. In laboris venison nulla turducken, short loin short ribs ullamco beef culpa incididunt minim proident. Kielbasa dolore eu aliquip sausage kevin picanha magna id lorem. Adipisicing pastrami chuck salami minim meatloaf, ribeye duis turducken shoulder et ham hock. Alcatra tempor enim burgdoggen pastrami adipisicing dolore swine chuck id cupidatat tenderloin meatloaf cow eu.
"""


# Create (or overwrite) the Markdown file: the heading line first, then the
# body. Mode 'w' truncates any existing file of the same name.
with open('write2file.md', mode='w') as md_file:
    md_file.write('# Bacon Ipsum\n' + content)

Read Text Files

# Load the whole Markdown file into memory as one string; the file is
# closed automatically when the with-block ends.
with open('write2file.md', mode='r') as md_file:
    content = md_file.read()

# Echo the text that was read to stdout.
print(content)

Edit Text Files

Remove Trailing Content

Remove content from a text file:

# Read the complete CSV text into memory.
with open('editfile.csv', mode='r') as raw_csv:
    content = raw_csv.read()

# Drop the last four characters. This assumes the file ends with exactly
# ";END" and nothing after it (not even a newline).
cleaned_content = content[:-4]
print(cleaned_content)

# Store the trimmed text in a separate file; the input file stays untouched.
with open('editfile_cleaned.csv', mode='w') as cleaned_csv:
    cleaned_csv.write(cleaned_content)

Edit & Merge Multiple Files

Remove content from multiple text files and merge into a single file:

from pathlib import Path

source = Path('editfiles/raw')
destination = 'editfiles/edited/editfile_merged.csv'
header = 'Login email;Identifier;One-time password;Recovery code;First name;Last name;Department;Location'

# Start the merge file from scratch so it contains only the header row.
with open(destination, mode='w') as merged_csv:
    merged_csv.write(f"{header}\n")

# Visit every entry in the source directory and append its cleaned text.
for filepath in source.iterdir():
    # Path.read_text() opens, reads and closes the file in one call.
    raw_text = filepath.read_text()
    # Drop the last four characters (assumes each file ends with ";END").
    cleaned_content = raw_text[:-4]
    print(cleaned_content)
    # Mode 'a' appends, so the header and earlier files are preserved.
    with open(destination, mode='a') as merged_csv:
        merged_csv.write(f"{cleaned_content}\n")

Replace Strings

Replace all instances of a word within multiple files and merge them:

from pathlib import Path

source = Path('editfiles/raw')
destination = 'editfiles/edited/editfile_replace.csv'
header = 'Login email;Identifier;One-time password;Recovery code;First name;Last name;Department;Location'

# create merge file and add header (mode 'w' truncates an existing file)
with open(destination, 'w') as file:
    file.write(header + "\n")

# get file path of raw files
for filepath in source.iterdir():
    with open(filepath, 'r') as file:
        # read files in source one by one
        content = file.read()
    # remove the trailing ";END" marker (assumed to be the last four characters)
    cleaned_content = content[:-4]
    # replace the string in the *cleaned* text; calling replace() on the raw
    # `content` would silently bring the ";END" marker back into the output
    replaced_content = cleaned_content.replace('mary@example.com', 'maryj@example.br')
    with open(destination, 'a') as file:
        # append cleaned and replaced content to destination file
        file.write(replaced_content + "\n")

Remove Header before Merging

When merging multiple CSV files we need to remove the header from all but the first file. The readlines() function splits the text at line breaks and returns every line as an item in a list, in which the item at position zero is our header:

from pathlib import Path

source = Path('editfiles/raw_header')
destination = 'editfiles/edited/editfile_merged_wHeader.csv'

# Collect one text chunk per input file, then join them once at the end.
chunks = []
for index, filepath in enumerate(source.iterdir()):
    with open(filepath, 'r') as csv_file:
        # readlines() returns a list of lines; item zero is the header row.
        content = csv_file.readlines()
    # Only the first file visited keeps its header; the rest drop line zero.
    kept_lines = content if index == 0 else content[1:]
    # Each file's text is followed by one extra line break.
    chunks.append(''.join(kept_lines) + '\n')

merged = ''.join(chunks)

# Write the merged text in one go, replacing any previous destination file.
with open(destination, 'w') as merged_csv:
    merged_csv.write(merged)

Replace Header after Merging

source = 'editfiles/edited/editfile_merged_wHeader.csv'
destination = 'editfiles/edited/editfile_merged_modified_Header.csv'

# Load the merged CSV as a list of lines (line endings are kept).
with open(source, mode='r') as merged_csv:
    content = merged_csv.readlines()

# Swap the first line (the header) for the new column names. An empty
# source file has no first line and raises IndexError here.
new_header = 'Email;ID;Password;Recovery;Name;Family;Department;Location'
content[0] = f"{new_header}\n"

# writelines() adds no separators, so every item must carry its own newline.
with open(destination, mode='w') as renamed_csv:
    renamed_csv.writelines(content)