What I'm looking to achieve:
The code below filters a parsed HTML page looking for specific values. Each value is added to its own list in the form of a dictionary. Once all the values have been collected, the dictionaries in those lists are combined into a JSON blob that I can then export.
Note - This is part of a quick PoC, so it was written quick and dirty. Forgive me.
My problem:
When the dictionaries from the following lists are combined, I do not encounter any issues exporting the blob:
jobs
names
dates
summaries
However, when the locations list is added to the merge, an IndexError exception is encountered, as shown in the image below:
IndexError Encountered
My Analysis:
I've found that sometimes the value is missing from the parsed HTML for reasons that I cannot control, i.e. it was not added by the user when the posting was created. The issue in this case is that the length of the locations list is 14 while the other lists all have length 15, which causes the IndexError exception when I combine the lists using a for loop.
My Question:
As shown in my code below, I'm trying to handle the issue by assigning a placeholder value, "null", when the scraped value is not found, but for some reason the placeholder is never applied and I still encounter the IndexError exception. Any help would be appreciated; thank you in advance.
My Code:
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something has gone badly wrong
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
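For reference, a tolerant variant of the merge loop could use itertools.zip_longest to pad whichever list comes up short. A rough sketch (the function name and "missing" filler are illustrative, and note this masks the misalignment rather than fixing it, since a padded entry can no longer be matched to the right job):

from itertools import zip_longest

def genJSONPadded():
    # hypothetical variant: short lists are padded instead of raising IndexError
    jsonBlob = []
    filler = {"missing": "null"}
    for job, name, date, summary, location in zip_longest(
            jobs, names, dates, summaries, locations, fillvalue=filler):
        jsonBlob.append({**job, **name, **date, **summary, **location})
    return jsonBlob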
Thank you @pavel for your notes on how to approach the issue. I found that the value I was looking for was actually a required field when the posting was created, so for some reason I was simply not getting the correct number of values when filtering the parsed data.
I reviewed the source code of the page(s) again and found that there was another field with the exact value I was looking for. So now, instead of getting the text of a span element inside the parent div, I am getting the custom data-* attribute value of the parent div element. I have not encountered a single error whilst testing.
Updated Code:
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
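A slightly more defensive version of the same loop would use Tag.get(), which returns a fallback instead of raising KeyError if the attribute is ever absent. A sketch, not the code I'm running:

# Loop - Get job location (defensive variant)
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
    for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
        # Tag.get() falls back to 'null' when 'data-rc-loc' is missing
        locations.append({"location": divChild.get('data-rc-loc', 'null')})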
Thank You to everyone who tried to help. This has been resolved.
Related
I'm new to Python and I'm challenging myself by making an online library management system, with a prompt for the first phase. I'm stuck on the search function. I have found how to print the line containing a user's input, but I can't work out how to print the lines that follow it. I want to search for a book by name: if the book's name is in the text file, I want to print the details of the book, like author, ISBN, etc.
Here is the code I have made:
def search():
search_book = input('Search a book: ')
with open('library.txt', mode='r', encoding='utf-8') as f:
index = 0
for line in f:
index += 1
if search_book in line:
print(f'{search_book} is in line {index}')
for details in range(index,index+5):
print(line[details])
And this is the text file's data:
FIRST
ME
9781234
2000
Science
SECOND
YOU
9791234
1980
Literature
Each field is on its own line. As an example, a user inputs the name FIRST and the result should be:
FIRST
ME
9781234
2000
Science
There are two file formats we can consider.
CSV file - Instead of one field per line, you could use one line for one book entry.
# ---------test.csv -------------
# BookName, ItemCode, Price
# Book1, 00012, 14.55
# Book2, 00232, 55.12
# -----End Csv-------------------
import csv
def read_csv(filename:str):
file_contents = None
    # reading csv file; list() materializes the rows, because csv.reader is
    # lazy and cannot be consumed after the file has been closed
    with open(filename, 'r') as csvfile:
        file_contents = list(csv.reader(csvfile))
    return file_contents
def search(file_contents, book_name:str):
if not file_contents:
return None
for line in file_contents:
if book_name in line:
return line
if __name__ == '__main__':
file_contents = read_csv('test.csv')
    line = search(file_contents, 'Book1')  # a name that actually appears in test.csv
print(line if line else 'No Hit Found')
JSON - This is a much better option than a CSV file.
import json
def read_json(filename:str) -> dict:
with open(filename) as json_file:
all_books = json.load(json_file)
return all_books
def search(all_books:dict, book_name:str):
for book_id, book_details in all_books.items():
if book_details['Name'] == book_name:
return book_details
return None
if __name__ == '__main__':
all_books = read_json('books.json')
book = search(all_books, 'YOU')
    print(book if book else 'No hit found')
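Note that search() assumes books.json maps some id to a details dict that has at least a Name key; the field names other than Name below are just illustrative:

{
    "1": {"Name": "FIRST", "Author": "ME", "ISBN": "9781234"},
    "2": {"Name": "SECOND", "Author": "YOU", "ISBN": "9791234"}
}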
If your file contents can't change, then I would go with @tripleee's suggestion above. Good luck.
You are reading one line at a time, and your inner loop indexes into the characters of the matched line rather than the lines that follow it. At that point in the program, the following lines have not been read yet. But a fix is relatively easy:
def search():
search_book = input('Search a book: ')
with open('library.txt', mode='r', encoding='utf-8') as f:
index = 0
print_this_many = 0
for line in f:
index += 1
if search_book in line:
print(f'{search_book} is in line {index}')
print_this_many = 5
if print_this_many:
print(line, end='')
print_this_many -= 1
We don't have the next lines in memory yet, but we can remember how many of them to print as we go ahead and read more of them. The print_this_many variable is used for this: when we see the title we want, we set it to 5 (to specify that this line and the next four should be printed). Now, each time we read a new line, we check whether this variable is positive; if it is, we print the line and decrement the variable. When it reaches zero, we no longer print the following lines. This lets us "remember", across iterations of the for loop that reads each new line, whether we are in the middle of printing something.
A much better solution is to read the database into memory once, and organize the lines into a dictionary, for example.
def read_lib(filename):
library = dict()
with open(filename) as lib:
title = None
info = []
        for line in lib:
            line = line.rstrip('\n')
            if title is None:
                title = line
            elif line == '':
                if title and info:
                    library[title] = info
                title = None
                info = []  # start a fresh record; reusing the old list would leak entries
            else:
                info.append(line)
        if title and info:
            library[title] = info  # flush the last record if the file lacks a trailing blank line
    return library
def search(title, library):
if title in library:
return library[title]
else:
return None
def main():
my_library = read_lib('library.txt')
while True:
sought = input('Search a book: ')
found = search(sought, my_library)
if found:
print('\n'.join(found))
else:
print('Sorry, no such title in library')
My intention was to copy the piece of string that follows either a colon or an equals sign in File 1, and paste that string into File 2 in a similar location, after either a colon or an equals sign.
For instance, if File 1 has:
username: Stack
File 2 originally has an empty value:
username=
I want Stack to be copied over to File 2 after username. Currently, I'm stuck and not sure what to do. The program piece I made below doesn't copy the username. I would greatly appreciate any input!
with open("C:/Users/SO//Downloads//f1.txt", "r") as f1:
with open("C:/Users/SO//Downloads//f2.txt", "r+") as f2:
searchlines = f1.readlines()
searchlines_f2=f2.readlines()
for i, line in enumerate(searchlines):
if 'username' in line:
for l in searchlines[i:i+1]:
ind = max(l.find(':'), l.find('='), 0) #finding index of specific characters
copy_string=l[ind+1:].strip() #copying string for file 2
for l in searchlines_f2[i:i+1]:
if 'username' in line:
f2.write(copy_string)
I think something like this will get you what you need in a more maintainable and Pythonic way.
Note the use of regex as well as some string methods (e.g., startswith).
import re
SOURCE_PATH = "C:/Users/SO//Downloads//f1.txt"
TARGET_PATH = "C:/Users/SO//Downloads//f2.txt"
def _get_lines(filepath):
""" read `filepath` and return a list of strings """
with open(filepath, "r+") as fh:
return fh.readlines()
def _get_value(fieldname, text):
""" parse `text` to get the value of `fieldname` """
    try:
        pattern = r'%s[:=]\s?(.*)' % fieldname  # raw string; the {1} quantifier was redundant
        return re.match(pattern, text).group(1)
    except AttributeError:
        # re.match() returned None (no match); you may want to handle this differently!
        return None
def _write_target(filepath, trgt_lines):
""" write `trgt_lines` to `filepath` """
with open(filepath, "w+") as fh:
fh.writelines(trgt_lines)
src_lines = _get_lines(SOURCE_PATH)
trgt_lines = _get_lines(TARGET_PATH)
# extract field values from source file
fields = ['username', 'id', 'location']
for field in fields:
value = None
for cur_src in src_lines:
if cur_src.startswith(field):
value = _get_value(field, cur_src)
break
# update target_file w/ value (if we were able to find it)
if value is not None:
for i, cur_trgt in enumerate(trgt_lines):
if cur_trgt.startswith('{0}='.format(field)):
                trgt_lines[i] = '{0}={1}\n'.format(field, value)  # keep the newline so lines stay separate
break
_write_target(TARGET_PATH, trgt_lines)
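For example, given the f1.txt from the question, the helper resolves the value like this (and it handles either separator):

# _get_value('username', 'username: Stack') -> 'Stack'
# _get_value('username', 'username=Stack')  -> 'Stack'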
I'm trying to write a program that modifies a file it opens, and I need to add a new column to the data.
In the opened .txt file the data looks like this (I use "_" to represent blanks):
Name_____Height(m)_____Weight(kg)
Bill________1.58__________58
Mary_____1.65__________43
...
And now I want to add a new column like this:
Name_____Height(m)_____Weight(kg)_____Age(year)<---The new vertical line
Bill________1.58__________58_____________15
Mary_____1.65__________43_____________17
And my code it's:
data_file = open("file.txt", "r")
print(data_file.read())
data_file.close()
So, how could I add another vertical line (column) to the file? Moreover, if I want to add more rows, how can I do that?
One more thing: I'm using Python 3.5.
I wrote a little class to do everything you asked for and more. Implementation examples are at the bottom. Let me know if this works for you.
class Feed(object):
def __init__(self, file_name, sep, naming_convention=None):
self.file_name = file_name
self.feed_item_naming = naming_convention
self.sep = sep
self.feed = self.load_feed()
def get_head(self, file=None):#lmao...
'''
Get the header
'''
if not file:
head = open(self.file_name).readline().split(self.sep)
else:
head = file[0].split(self.sep)
return head
def __repr__(self):
return repr(self.feed)
def load_feed(self):
'''
load a feed object
set the key of each item to the naming convention
if we have multiple item names we increment the name bill becomes bill_2 and then bill_3 etc...
'''
#first we open the file and grab the headers
file = [x.rstrip() for x in open(self.file_name).readlines()]
self.header = self.get_head(file)
        if not self.feed_item_naming or self.feed_item_naming not in self.header:
self.feed_item_naming = self.header[0]
data = {}
for line in file[1:]:
if line != '':
line = line.split(self.sep)
pos = line[self.header.index(self.feed_item_naming)]
                while pos in data:
                    try:
                        # bump a trailing digit: bill_2 -> bill_3, bill_3 -> bill_4, ...
                        ending = str(int(pos[-1]) + 1)
                        pos = pos[:-1] + ending
                    except ValueError:
                        pos = pos + '_2'
data[pos] = {}
for item in self.header:
data[pos][item] = line[self.header.index(item)]
return data
def unload_feed(self, file_name=None, sep=None):
'''
write the modified feed back out to a data file
'''
if not file_name:
file_name = self.file_name
if not sep:
sep = self.sep
        with open(file_name, 'w') as file:  # text mode; we write str, not bytes
for i in self.header:
if i != self.header[-1]:
file.write(i+sep)
else:
file.write(i)
file.write('\n')
for i in self.feed:
for x in self.header:
if x != self.header[-1]:
file.write(str(self.feed[i][x])+sep)
else:
file.write(str(self.feed[i][x]))
file.write('\n')
def add_key(self, key, default_value=None):
'''
Add a key to each of the items
'''
if key not in self.header:
for i in self.feed:
self.feed[i][key]=default_value
self.header.append(key)
def get_key_value(self, item, key):
'''
get the value for the items key
'''
return self.feed[item][key]
def get_item(self, item):
'''
get an individual item
'''
return self.feed[item]
def set_key_value(self, item, key, value):
'''
set the value of each items key
{item:{key:value, key:value}, item...etc}
'''
self.feed[item][key] = value
def set_key_values(self, item, key_value_dict):
'''
set multiple key values for an item
'''
        for k, v in key_value_dict.items():  # items(), not iteritems(), on Python 3
self.set_key_value(item, k, v)
def add_item(self, item):
'''
Add a new item
'''
        while item in self.feed:
            try:
                # bump a trailing digit: Angela_2 -> Angela_3, ...
                end = str(int(item[-1]) + 1)
                item = item[:-1] + end
            except ValueError:
                item = item + '_2'
self.feed[item] = {}
self.feed[item][self.feed_item_naming] = item
for i in self.header:
if i != self.feed_item_naming:
self.feed[item][i] = None
f = Feed('file.txt', '_____', 'Name') #initialize a new feed object; make sure the separators are the same for each item in your file
f.add_item('Angela') #add a new item
f.set_key_values('Angela', {'Height(m)':5, 'Weight(kg)':123}) #set the new items height and weight
f.add_key('Position')#create a new key for each item
f.unload_feed() #write the feed back to the file
print(f)
If by "add a new vertical line" you mean "add a new column" to your file, you can do this with the help of the csv module.
The code below works by reading the contents of your file as a list, making the changes, and then writing the updated list back to the file. You can add rows to your file this way, as well.
import csv
with open('file.txt', 'r') as f:
reader = list(csv.reader(f, delimiter=' ')) # if your file is delimited by spaces, tabs, etc.
# include that value here. It appears that
# your file is space-delimited, but that's
# just a guess based on the info in your question.
for i,row in enumerate(reader):
if i == 0:
row.append('Age(year)')
if i == 1:
row.append('15')
if i == 2:
row.append('17')
with open('file.txt','w') as f:
wr = csv.writer(f, delimiter=' ')
for row in reader:
wr.writerow(row)
# file.txt output:
# Name Height(m) Weight(kg) Age(year)
# Bill 1.58 58 15
# Mary 1.65 43 17
This code also uses with statements when working with your file. Using either with or close() (like you included in your question) is the correct practice when working with files. A with statement is easy to use because it closes your file automatically.
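For reference, a with block is roughly equivalent to a manual try/finally around close():

# manual form: without try/finally, close() is skipped if an exception fires first
f = open('file.txt')
try:
    data = f.read()
finally:
    f.close()

# with form: the file is closed automatically, even on error
with open('file.txt') as f:
    data = f.read()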
I'm converting text directly to epub and I'm having a problem automatically splitting the HTML book file into separate header/chapter files. At the moment, the code below partially works but only creates every other chapter file. So half the header/chapter files are missing from the output. Here is the code:
def splitHeaderstoFiles(fpath):
infp = open(fpath, 'rt', encoding=('utf-8'))
for line in infp:
# format and split headers to files
if '<h1' in line:
#-----------format header file names and other stuff ------------#
# create a new file for the header/chapter section
path = os.getcwd() + os.sep + header
with open(path, 'wt', encoding=('utf-8')) as outfp:
# write html top meta headers
outfp = addMetaHeaders(outfp)
# add the header
outfp = outfp.write(line)
# add the chapter/header bodytext
for line in infp:
if '<h1' not in line:
outfp.write(line)
else:
outfp.write('</body>\n</html>')
break
else:
continue
infp.close()
The problem occurs in the second for loop at the bottom of the code, when I look for the next h1 tag to stop the split. I cannot use seek() or tell() to rewind or move back one line so that the program can find the next header/chapter on the next iteration. Apparently you cannot use these in Python while a for loop is iterating over the file; it just gives a 'can't do nonzero cur-relative seeks' error.
I've also tried a while line != '' loop combined with readline(), which gives the same error as above.
Does anyone know an easy way to split HTML headers/chapters of varying lengths into separate files in Python? Are there any special Python modules (such as pickle) that could help make this task easier?
I'm using Python 3.4
My grateful thanks in advance for any solutions to this problem...
I ran into similar problem a while ago, here is a simplified solution:
from itertools import count
chapter_number = count(1)
output_file = open('000-intro.html', 'wt')  # text mode: we write str lines read from a text file
with open('index.html', 'rt') as input_file:
for line in input_file:
if '<h1' in line:
output_file.close()
            output_file = open('{:03}-chapter.html'.format(next(chapter_number)), 'wt')
output_file.write(line)
output_file.close()
In this approach, the first block of text leading to the first h1 block is written into 000-intro.html, the first chapter will be written into 001-chapter.html and so on. Please modify it to taste.
The solution is a simple one: Upon encountering the h1 tag, close the last output file and open a new one.
You are iterating over the same file object in two nested loops, which is likely causing your problem:
for line in infp:
...
with open(path, 'wt', encoding=('utf-8')) as outfp:
...
for line in infp:
...
Both for loops share the file object's single iterator, so the inner loop consumes lines (including the line containing the next <h1> tag) that the outer loop then never sees; that is why every other chapter goes missing.
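A quick way to see the shared-iterator behaviour for yourself (sample.txt here is any hypothetical multi-line file):

with open('sample.txt') as f:
    for line in f:
        print('outer:', line.strip())
        next(f, None)  # reading inside the loop consumes a line the outer for never sees

Run against a six-line file, the outer loop prints only lines 1, 3 and 5.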
You might try transforming your for loops into whiles, so the current line is tracked explicitly and the header line found by the inner loop is not lost:
line = infp.readline()
while line:
    if '<h1' in line:
        with open(...) as outfp:
            outfp.write(line)  # write the <h1> line itself
            line = infp.readline()
            while line and '<h1' not in line:
                outfp.write(line)
                line = infp.readline()
        # `line` now holds the next <h1> (or ''), so the outer loop picks it up
    else:
        line = infp.readline()
Alternatively, you may wish to use an HTML parser (e.g., BeautifulSoup). Then you can do something like what is described here: https://stackoverflow.com/a/8735688/65295.
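If you go the parser route, here is a rough sketch with BeautifulSoup (it assumes the <h1> tags and the chapter bodies are siblings, and borrows the numbered file naming from the answer above; adjust to your markup):

from bs4 import BeautifulSoup

with open('index.html', encoding='utf-8') as fh:
    soup = BeautifulSoup(fh, 'html.parser')

# each <h1> starts a chapter: keep it plus every sibling up to the next <h1>
for num, h1 in enumerate(soup.find_all('h1'), start=1):
    parts = [str(h1)]
    for sibling in h1.find_next_siblings():
        if sibling.name == 'h1':
            break
        parts.append(str(sibling))
    with open('{:03}-chapter.html'.format(num), 'w', encoding='utf-8') as out:
        out.write('\n'.join(parts))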
Update from comment - essentially, read the entire file all at once so you can freely move back or forward as necessary. This probably won't be a performance issue unless you have a really big file (or very little memory).
lines = infp.readlines() # read the entire file
i = 0
while i < len(lines):
    if '<h1' in lines[i]:
        with open(...) as outfp:
            outfp.write(lines[i])  # the <h1> line itself
            j = i + 1
            while j < len(lines):
                if '<h1' in lines[j]:
                    break
                outfp.write(lines[j])
                j += 1  # without this the inner loop never advances
        # line j has an <h1> (or is past the end); set i to j so we
        # detect it at the top of the next loop iteration.
        i = j
    else:
        i += 1
I eventually found the answer to the above problem. The code below does a lot more than just split on the file headers. It also simultaneously loads two parallel lists, one with formatted file names (with extension) and one with the pure header names, so I can use these lists to fill in the <title> tags and formatted file names in these HTML files within a while loop in one hit. The code now works well and is shown below.
def splitHeaderstoFiles(dir, inpath):
count = 1
t_count = 0
out_path = ''
header = ''
write_bodytext = False
file_path_names = []
pure_header_names = []
inpath = dir + os.sep + inpath
with open(inpath, 'rt', encoding=('utf-8')) as infp:
for line in infp:
if '<h1' in line:
#strip html tags, convert to start caps
p = re.compile(r'<.*?>')
header = p.sub('', line)
header = capwords(header)
line_save = header
# Add 0 for count below 10
if count < 10:
header = '0' + str(count) + '_' + header
else:
header = str(count) + '_' + header
# remove all spaces + add extension in header
header = header.replace(' ', '_')
header = header + '.xhtml'
count = count + 1
#create two parallel lists used later
out_path = dir + os.sep + header
outfp = open(out_path, 'wt', encoding=('utf-8'))
file_path_names.insert(t_count, out_path)
pure_header_names.insert(t_count, line_save)
t_count = t_count + 1
# Add html meta headers and write it
outfp = addMainHeaders(outfp)
outfp.write(line)
write_bodytext = True
            # add header bodytext
            elif write_bodytext == True:
                outfp.write(line)
        outfp.close()  # flush the last chapter file before it is re-read below
    # now add html titles and close the html tails on all files
max_num_files = len(file_path_names)
tmp = dir + os.sep + 'temp1.tmp'
i = 0
while i < max_num_files:
outfp = open(tmp, 'wt', encoding=('utf-8'))
infp = open(file_path_names[i], 'rt', encoding=('utf-8'))
for line in infp:
if '<title>' in line:
line = line.strip(' ')
line = line.replace('<title></title>', '<title>' + pure_header_names[i] + '</title>')
outfp.write(line)
else:
outfp.write(line)
# add the html tail
if '</body>' in line or '</html>' in line:
pass
else:
outfp.write(' </body>' + '\n</html>')
# clean up
infp.close()
outfp.close()
shutil.copy2(tmp, file_path_names[i])
os.remove(tmp)
i = i + 1
# now rename just the title page
if os.path.isfile(file_path_names[0]):
title_page_name = file_path_names[0]
new_title_page_name = dir + os.sep + '01_Title.xhtml'
os.rename(title_page_name, new_title_page_name)
file_path_names[0] = '01_Title.xhtml'
else:
logmsg27(DEBUG_FLAG)
os._exit(0)
# xhtml file is no longer needed
if os.path.isfile(inpath):
os.remove(inpath)
# returned list values are also used
# later to create epub opf and ncx files
return(file_path_names, pure_header_names)
@Hai Vu and @Seth: thanks for all your help.
I am having problems deleting a specific line/entry within a text file. With the code I have, the top line in the file is deleted no matter what line number I select to delete.
def erase():
contents = {}
f = open('members.txt', 'a')
f.close()
f = open('members.txt', 'r')
index = 0
for line in f:
index = index + 1
contents[index] = line
print ("{0:3d}) {1}".format(index,line))
f.close()
total = index
entry = input("Enter number to be deleted")
f = open('members.txt', 'w')
index = 0
for index in range(1,total):
index = index + 1
if index != entry:
f.write(contents[index])
Try this:
import sys
import os
def erase(file):
assert os.path.isfile(file)
with open(file, 'r') as f:
content = f.read().split("\n")
#print content
    entry = int(input("Enter number to be deleted: "))  # input() returns a str; convert before comparing
    assert entry >= 0 and entry < len(content)
new_file = content[:entry] + content[entry+1:]
#print new_file
with open(file,'w') as f:
f.write("\n".join(new_file))
if __name__ == '__main__':
erase(sys.argv[1])
As already noted, you were starting the range from 1, which is incorrect. The list slicing I used in new_file = content[:entry] + content[entry+1:] makes the code more readable, and it is an approach less prone to similar off-by-one errors.
You also open and immediately close the input file at the beginning for no reason, and you should use with whenever possible when doing operations on files.
Finally, I used join and split to simplify the code, so you don't need a for loop to process the lines of the file.
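That is, the whole file round-trips through a list:

content = f.read().split("\n")                              # one string per line
new_text = "\n".join(content[:entry] + content[entry+1:])   # drop line `entry`, rejoin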