Python text extractor & organizer

Python text extractor & organizer - python

I need to extract details of some costumers and save it in a new database all I have its only a txt file so we are talking about 5000 costumers or more that txt file its saved all in this way:
first and last name
NAME SURNAME
zip country n. phone number mobile
United Kingdom +1111111111
e-mail
email#email.email
guest first and last name 1°
NAME SURNAME
guest first and last name 2°
NAME SURNAME
name address city province
NAME SURNAME London London
zip
AAAAA
Cancellation of the reservation.
Since the file is always like this I was thinking there could be a way to scrape so I did some research as far, this is what I have came up with but not really what I need:
with open('input.txt') as infile, open('output.txt', 'w') as outfile:
copy = False
for line in infile:
if (line.find("first and last name") != -1):
copy = True
elif (line.find("Cancellation of the reservation.") != -1):
copy = False
elif copy:
outfile.write(line)
The codes works but simply reads the file from a line to other and copies the content I need something that will copy the content in an other format like this I am able to uploaded on the database the format I need is this:
first and last name | zip country n. phone number mobile|e-mail|guest first and last name 1°|name address city province|zip
So in this case I need it like this:
NAME SURNAME | United Kingdom +1111111111|email#email.email|NAME SURNAME London London |AAAAA
For every line in the output.txt

these are some good scraping tools for what you're looking to do:
data = '''first and last name
NAME SURNAME
zip country n. phone number mobile
United Kingdom +1111111111
e-mail
email#email.email
guest first and last name 1
NAME SURNAME
guest first and last name 2
NAME SURNAME
name address city province
NAME SURNAME London London
zip
AAAAA
Cancellation of the reservation.
'''
# split on space, convert to list
ldata = data.split()
# strip leading and trailing white space from each item
ldata = [i.strip() for i in ldata]
# split on line break, convert to list
ndata = data.split('\n')
ndata = [i.strip() for i in ndata]
#convert list to string
sdata = ' '.join(ldata)
print ldata
print ndata
print sdata
# two examples of split after, split before
name_surname = sdata.split('first and last name')[1].split('zip')[0]
print name_surname
country_phone = sdata.split('mobile')[1].split('e-mail')[0]
print country_phone
>>>
['first', 'and', 'last', 'name', 'NAME', 'SURNAME', 'zip', 'country', 'n.', 'phone', 'number', 'mobile', 'United', 'Kingdom', '+1111111111', 'e-mail', 'email#email.email', 'guest', 'first', 'and', 'last', 'name', '1', 'NAME', 'SURNAME', 'guest', 'first', 'and', 'last', 'name', '2', 'NAME', 'SURNAME', 'name', 'address', 'city', 'province', 'NAME', 'SURNAME', 'London', 'London', 'zip', 'AAAAA', 'Cancellation', 'of', 'the', 'reservation.']
['first and last name', 'NAME SURNAME', 'zip country n. phone number mobile', 'United Kingdom +1111111111', 'e-mail', 'email#email.email', 'guest first and last name 1', 'NAME SURNAME', 'guest first and last name 2', 'NAME SURNAME', 'name address city province', 'NAME SURNAME London London', 'zip', 'AAAAA', 'Cancellation of the reservation.', '']
first and last name NAME SURNAME zip country n. phone number mobile United Kingdom +1111111111 e-mail email#email.email guest first and last name 1 NAME SURNAME guest first and last name 2 NAME SURNAME name address city province NAME SURNAME London London zip AAAAA Cancellation of the reservation.
NAME SURNAME
United Kingdom +1111111111

Related

Using regular expression to remove commonly used company suffixes from a list of companies

I have the following code that I use to generate a list of common company suffixes below:
import re
from cleanco import typesources,
import string
def generate_common_suffixes():
unique_items = []
company_suffixes_raw = typesources()
for item in company_suffixes_raw:
for i in item:
if i.lower() not in unique_items:
unique_items.append(i.lower())
unique_items.extend(['holding'])
return unique_items
I'm then trying to use the following code to remove those suffixes from a list of company names
company_name = ['SAMSUNG ÃŠLECTRONICS Holding, LTD', 'Apple inc',
'FIIG Securities Limited Asset Management Arm',
'First Eagle Alternative Credit, LLC', 'Global Credit
Investments','Seatown', 'Sona Asset Management']
suffixes = generate_common_suffixes()
cleaned_names = []
for company in company_name:
for suffix in suffixes:
new = re.sub(r'\b{}\b'.format(re.escape(suffix)), '', company)
cleaned_names.append(new)
I keep getting a list of unchanged company names despite knowing that the suffixes are there.
Alternate Attempt
I've also tried an alternate method where I'd look for the word and replace it without regex, but i couldn't figure out why it was removing parts of the company name itself - for example, it would remove the first 3 letters in Samsung
for word in common_words:
name = name.replace(word, "")
Any help is greatly appreciated!

import unicodedata
from cleanco import basename
import re
company_names = ['SAMSUNG ÃŠLECTRONICS Holding, LTD',
'Apple inc',
'FIIG Securities Limited Asset Management Arm',
'First Eagle Alternative Credit, LLC',
'Global Credit Investments',
'Seatown',
'Sona Asset Management']
suffix = ["holding"] # "Common words"? You can add more
cleaned_names = []
for company_name in company_names:
# To Lower
company_name = company_name.lower()
# Fix unicode
company_name = unicodedata.normalize('NFKD', company_name).encode('ASCII', 'ignore').decode()
# Remove punctuation
company_name = re.sub(r'[^\w\s]', '', company_name)
# Remove suffixes
company_name = basename(company_name)
# Remove common words
for word in suffix:
company_name = re.sub(fr"\b{word}\b", '', company_name)
# Save
cleaned_names.append(company_name)
print(cleaned_names)
Ouput:
['samsung aalectronics ', 'apple', 'fiig securities limited asset management arm', 'first eagle alternative credit', 'global credit investments', 'seatown', 'sona asset management']

How do I transform a non-CSV text file into a CSV using Python/Pandas?

I have a text file that looks like this:
Id Number: 12345678
Location: 1234561791234567090-8.9
Street: 999 Street AVE
Buyer: john doe
Id Number: 12345688
Location: 3582561791254567090-8.9
Street: 123 Street AVE
Buyer: Jane doe # buyer % LLC
Id Number: 12345689
Location: 8542561791254567090-8.9
Street: 854 Street AVE
Buyer: Jake and Bob: Owner%LLC: Inc
I'd like the file to look like this:
Id Number
Location
Street
Buyer
12345678
1234561791234567090-8.9
999 Street AVE
john doe
12345688
3582561791254567090-8.9
123 Street AVE
Jane doe # buyer % LLC
12345689
8542561791254567090-8.9
854 Street AVE
Jake and Bob: Owner%LLC: Inc
I have tried the following:
# 1 Read text file and ignore bad lines (lines with extra colons thus reading as extra fields).
tr = pd.read_csv('C:\\File Path\\test.txt', sep=':', header=None, error_bad_lines=False)
# 2 Convert into a dataframe/pivot table.
ndf = pd.DataFrame(tr.pivot(index=None, columns=0, values=1))
# 3 Clean up the pivot table to remove NaNs and reset the index (line by line).
nf2 = ndf.apply(lambda x: x.dropna().reset_index(drop=True))
Here is where got the last line (#3): https://stackoverflow.com/a/62481057/10448224
When I do the above and export to CSV the headers are arranged like the following:
(index)
Street
Buyer
Id Number
Location
The data is filled in nicely but at some point the Buyer field becomes inaccurate but the rest of the fields are accurate through the entire DF.
My guesses:
When I run #1 part of my script I get the following errors 507 times:
b'Skipping line 500: expected 2 fields, saw 3\nSkipping line 728: expected 2 fields, saw 3\
At the tail end of the new DF I am missing exactly 507 entries for the Byer field. So I think when I drop my bad lines, the field is pushing my data up.
Pain Points:
The Buyer field will sometimes have extra colons and other odd characters. So when I try to use a colon as a delimiter I run into problems.
I am new to Python and I am very new to using functions. I primarily use Pandas to manipulate data at a somewhat basic level. So in the words of the great Michael Scott: "Explain it to me like I'm five." Many many thanks to anyone willing to help.

Here's what I meant by reading in and using split. Very similar to other answers. Untested and I don't recall if inputline include eol, so I stripped it too.
with open('myfile.txt') as f:
data = [] # holds database
record = {} # holds built up record
for inputline in f:
key,value = inputline.strip().split(':',1)
if key == "Id Number": # new record starting
if len(record):
data.append(record) # write previous record
record = {}
record.update({key:value})
if len(record):
data.append(record) # out final record
df = pd.DataFrame(data)

This is a minimal example that demonstrates the basics:
cat split_test.txt
Id Number: 12345678
Location: 1234561791234567090-8.9
Street: 999 Street AVE
Buyer: john doe
Id Number: 12345688
Location: 3582561791254567090-8.9
Street: 123 Street AVE
Buyer: Jane doe # buyer % LLC
Id Number: 12345689
Location: 8542561791254567090-8.9
Street: 854 Street AVE
Buyer: Jake and Bob: Owner%LLC: Inc
import csv
with open("split_test.txt", "r") as f:
id_val = "Id Number"
list_var = []
for line in f:
split_line = line.strip().split(':')
print(split_line)
if split_line[0] == id_val:
d = {}
d[split_line[0]] = split_line[1]
list_var.append(d)
else:
d.update({split_line[0]: split_line[1]})
list_var
[{'Id Number': ' 12345689',
'Location': ' 8542561791254567090-8.9',
'Street': ' 854 Street AVE',
'Buyer': ' Jake and Bob'},
{'Id Number': ' 12345678',
'Location': ' 1234561791234567090-8.9',
'Street': ' 999 Street AVE',
'Buyer': ' john doe'},
{'Id Number': ' 12345688',
'Location': ' 3582561791254567090-8.9',
'Street': ' 123 Street AVE',
'Buyer': ' Jane doe # buyer % LLC'}]
with open("split_ex.csv", "w") as csv_file:
field_names = list_var[0].keys()
csv_writer = csv.DictWriter(csv_file, fieldnames=field_names)
csv_writer.writeheader()
for row in list_var:
csv_writer.writerow(row)

I would try reading the file line by line, splitting the key-value pairs into a list of dicts to look something like:
data = [
{
"Id Number": 12345678,
"Location": 1234561791234567090-8.9,
...
},
{
"Id Number": ...
}
]
# easy to create the dataframe from here
your_df = pd.DataFrame(data)

how can i get yaml format with input in python ??(without using pyyaml library)

I want to get data from input that is in yaml format.The data includes user information and music albums information that each user has purchased.Input information is as follows:
2 # this line specify the number of users
- name: user1
age: 18
city: city1
albums:
- album1
- album2
- album3
- name: user2
age: 20
city: city2
albums:
- album2
- album1
- alubm3
3 # this line specify the number of albums
- name: album1
singer: singer1
genre: classic
tracks: 10
- name: album2
singer: singer2
genre: pop
tracks: 22
- name: album3
singer: singer3
genre: pop
tracks: 14
I wrote the following code for this
num_user = int(input())
users_data = {}
albums_data = {}
for i in range(num_user):
name, age, city = input().split()[-1], input().split()[-1], input().split()[-1]
input()
albums=[]
next_line = input()
while next_line.split()[0]=='-' and len(next_line)-len(next_line.lstrip(' '))==4:
albums.append(next_line.split()[-1])
next_line = input()
if len(next_line.split()) < 2:
num_albums = int(next_line)
users_data[name]=[age, city, albums]
for i in range(num_albums):
name, singer, genre, tracks = input().split()[-1],input().split()[-1],\
input().split()[-1], input().split()[-1]
albums_data[name]=[singer, genre, tracks]
Everything is in order until the number of users exceeds one person and I have trouble storing the second user information in the dictionary and all the moving information is stored.
I want this:
{'user1': ['18', 'city1', ['album1', 'album2', 'album3']], 'user2': ['20', 'city2', ['album2', 'album1', 'alubm3']]} {'album1': ['singer1', 'classic', '10'], 'album2': ['beeptunes', 'pop', '22'], 'tekunbede': ['beeptunes', 'pop', '14']}
but get this:
{'user1': ['18', 'city1', ['album1', 'album2', 'album3']], '20': ['city2', 'albums:', ['album1', 'alubm3']]} {'album1': ['singer1', 'classic', '10'], 'album2': ['beeptunes', 'pop', '22'], 'tekunbede': ['beeptunes', 'pop', '14']}

The issue seems to be that once you have processed the last album for the first user you are then calling input() again which is getting the name. Decoupling the input from the processing will help to fix the issue so have a look at creating a function to process a name once its been detected.
so try:
read the input
work out what do based on the input
process the read data
num_user = int(input())
users_data = {}
albums_data = {}
for i in range(num_user):
name, age, city = input().split()[-1], input().split()[-1], input().split()[-1]
input()
albums=[]
next_line = input()
while next_line.split()[0]=='-' and len(next_line)-len(next_line.lstrip(' '))==4:
albums.append(next_line.split()[-1])
next_line = input() # This is the line with the issue
if len(next_line.split()) < 2:
num_albums = int(next_line)
users_data[name]=[age, city, albums]
for i in range(num_albums):
name, singer, genre, tracks = input().split()[-1],input().split()[-1],\
input().split()[-1], input().split()[-1]
albums_data[name]=[singer, genre, tracks]

Grab one or two words IF capitalised after a pattern and match the result with another list

I need to extract unique names with titles such as Lord|Baroness|Lady|Baron from text and match it with another list. I struggle to get the right result and hope the community can help me. Thanks!
import re
def get_names(text):
# find nobel titles and grab it with the following name
match = re.compile(r'(Lord|Baroness|Lady|Baron) ([A-Z][a-z]+) ([A-Z][a-z]+)')
names = list(set(match.findall(text)))
# remove duplicates based on the index in tuples
names_ = list(dict((v[1],v) for v in sorted(names, key=lambda names: names[0])).values())
names_lst = list(set([' '.join(map(str, name)) for name in names_]))
return names_lst
text = 'Baroness Firstname Surname and Baroness who is also known as Lady Anothername and Lady Surname or Lady Firstname.'
names_lst = get_names(text)
print(names_lst)
Which now yields:['Baroness Firstname Surname']
Desired output: ['Baroness Firstname Surname', 'Lady Anothername'] but NOT Lady Surname or Lady Firstname
Then I need to match the result with this list:
other_names = ['Firstname Surname', 'James', 'Simon Smith']
and drop the element 'Firstname Surname' from it because it matches the first name and surname of the Baroness in 'the desired output'.

I suggest you the following solution:
import re
def get_names(text):
# find nobel titles and grab it with the following name
match = re.compile(r'(Lord|Baroness|Lady|Baron) ([A-Z][a-z]+)[ ]?([A-Z][a-z]+)?')
names = list(match.findall(text))
# keep only the first title encountered
d = {}
for name in names:
if name[0] not in d:
d[name[0]] = ' '.join(name[1:3]).strip()
return d
text = 'Baroness Firstname Surname and Baroness who is also known as Lady Anothername and Lady Surname or Lady Firstname.'
other_names = ['Firstname Surname', 'James', 'Simon Smith']
names_dict = get_names(text)
print(names_dict)
# {'Baroness': 'Firstname Surname', 'Lady': 'Anothername'}
print([' '.join([k,v]) for k,v in names_dict.items()])
# ['Baroness Firstname Surname', 'Lady Anothername']
other_names_dropped = [name for name in other_names if name not in names_dict.values()]
print(other_names_dropped)
# ['James', 'Simon Smith']

comparing two lists and searching by a field, Python

I have two files I wish to compare and then produce a specific output:
1) Below are the contents of the username text file (this stores the latest films viewed by the user)
Sci-Fi,Out of the Silent Planet
Sci-Fi,Solaris
Romance, When Harry met Sally
2) Below are the contents of the films.txt file which stores all the films in the program that are available to the user
0,Genre, Title, Rating, Likes
1,Sci-Fi,Out of the Silent Planet, PG,3
2,Sci-Fi,Solaris, PG,0
3,Sci-Fi,Star Trek, PG,0
4,Sci-Fi,Cosmos, PG,0
5,Drama, The English Patient, 15,0
6,Drama, Benhur, PG,0
7,Drama, The Pursuit of Happiness, 12, 0
8,Drama, The Thin Red Line, 18,0
9,Romance, When Harry met Sally, 12, 0
10,Romance, You've got mail, 12, 0
11,Romance, Last Tango in Paris, 18, 0
12,Romance, Casablanca, 12, 0
An example of the output I require: The user has currently viewed two sci-fi and one Romance film. The output therefore should SEARCH the Films text file by Genre (identifying SCI-FI and ROMANCE), and should list the films in the films.txt file which have NOT been viewed by the user yet. In this case
3,Sci-Fi,Star Trek, PG,0
4,Sci-Fi,Cosmos, PG,0
10,Romance, You've got mail, 12, 0
11,Romance, Last Tango in Paris, 18, 0
12,Romance, Casablanca, 12, 0
I have the following code which attempts to do the above, but the output it produces is incorrect:
def viewrecs(username):
#set the username variable to the text file -to use it in the next bit
username = (username + ".txt")
#open the username file that stores latest viewings
with open(username,"r") as f:
#open the csv file reader for the username file
fReader=csv.reader(f)
#for each row in the fReader
for row in fReader:
#set the genre variable to the row[0], in which row[0] is all the genres (column 1 in username file)
genre=row[0]
#next, open the films file
with open("films.txt","r") as films:
#open the csv reader for this file (filmsReader as opposed to fReader)
filmsReader=csv.reader(films)
#for each row in the films file
for row in filmsReader:
#and for each field in the row
for field in row:
#print(field)
#print(genre)
#print(field[0])
if genre in field and row[2] not in fReader:
print(row)
Output (undesired):
['1', 'Sci-Fi', 'Out of the Silent Planet', ' PG', '3']
['2', 'Sci-Fi', 'Solaris', ' PG', '0']
['3', 'Sci-Fi', 'Star Trek', ' PG', '0']
['4', 'Sci-Fi', 'Cosmos', ' PG', '0']
I don't want a re-write or new solution, but, preferably, a fix to the above solution with its logical progression ...
#gipsy - your solution appears to have nearly worked. I used:
def viewrecs(username):
#set the username variable to the text file -to use it in the next bit
username = (username + ".txt")
#open the username file that stores latest viewings
lookup_set = set()
with open(username,"r") as f:
#open the csv file reader for the username file
fReader=csv.reader(f)
#for each row in the fReader
for row in fReader:
genre = row[1]
name = row[2]
lookup_set.add('%s-%s' % (genre, name))
with open("films.txt","r") as films:
filmsReader=csv.reader(films)
#for each row in the films file
for row in filmsReader:
genre = row[1]
name = row[2]
lookup_key = '%s-%s' % (genre, name)
if lookup_key not in lookup_set:
print(row)
The output is as below: It is printing ALL the lines in allfilms that are not in the first set, rather than just the ones based on the GENRE in the first set:
['0', 'Genre', ' Title', ' Rating', ' Likes']
['3', 'Sci-Fi', 'Star Trek', ' PG', ' 0']
['4', 'Sci-Fi', 'Cosmos', ' PG', ' 0']
['5', 'Drama', ' The English Patient', ' 15', ' 0']
['6', 'Drama', ' Benhur', ' PG', ' 0']
['7', 'Drama', ' The Pursuit of Happiness', ' 12', ' 0']
['8', 'Drama', ' The Thin Red Line', ' 18', ' 0']
['10', 'Romance', " You've got mail", ' 12', ' 0']
['11', 'Romance', ' Last Tango in Paris', ' 18', ' 0']
['12', 'Romance', ' Casablanca', ' 12', ' 0']
NOTE: I changed the format of the first set to be the same, for simplicity, of the all films entries:
1,Sci-Fi,Out of the Silent Planet, PG
2,Sci-Fi,Solaris, PG

How about using sets and separate lists to filter movies in appropriate genres that were not seen? We can even abuse the dictionaries' keys and values for this purpose:
def parse_file (file):
return map(lambda x: [w.strip() for w in x.split(',')], open(file).read().split('\n'))
def movies_to_see ():
seen = {film[0]: film[1] for film in parse_file('seen.txt')}
films = parse_file('films.txt')
to_see = []
for film in films:
if film[1] in seen.keys() and film[2] not in seen.values():
to_see.append(film)
return to_see

The solution using str.split() and str.join() functions:
# change file paths with your actual ones
with open('./text_files/user.txt', 'r') as userfile:
viewed = userfile.read().split('\n')
viewed_genders = set(g.split(',')[0] for g in viewed)
with open('./text_files/films.txt', 'r') as filmsfile:
films = filmsfile.read().split('\n')
not_viewed = [f for f in films
if f.split(',')[1] in viewed_genders and ','.join(f.split(',')[1:3]) not in viewed]
print('\n'.join(not_viewed))
The output:
3,Sci-Fi,Star Trek, PG,0
4,Sci-Fi,Cosmos, PG,0
10,Romance, You've got mail, 12, 0
11,Romance, Last Tango in Paris, 18, 0
12,Romance, Casablanca, 12, 0

Okay , build a set going through the first file with Genre + name as the entry.
Now iterate over the second file and lookup in the set you made above for an entry for Genre+ name, if not exists print that out.
Once I am home I can type some code.
As promised my code for this is below:
def viewrecs(username):
#set the username variable to the text file -to use it in the next bit
username = (username + ".txt")
# In this set we will collect the unique combinations of genre and name
genre_name_lookup_set = set()
# In this set we will collect the unique genres
genre_lookup_set = set()
with open(username,"r") as f:
#open the csv file reader for the username file
fReader=csv.reader(f)
#for each row in the fReader
for row in fReader:
genre = row[0]
name = row[1]
# Add the genre name combination to this set, duplicates will be taken care automatically as set won't allow dupes
genre_name_lookup_set.add('%s-%s' % (genre, name))
# Add genre to this set
genre_lookup_set.add(genre)
with open("films.txt","r") as films:
filmsReader=csv.reader(films)
#for each row in the films file
for row in filmsReader:
genre = row[1]
name = row[2]
# Build a lookup key using genre and name, example:Sci-Fi-Solaris
lookup_key = '%s-%s' % (genre, name)
if lookup_key not in genre_name_lookup_set and genre in genre_lookup_set:
print(row)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.