I'm trying to update a program that pulls and reads 10-K HTML filings, and I'm getting a FileNotFoundError. The error is thrown inside the readHTML function. It looks like the FileName parameter is picking up the path from the Form10KName column when it should be using the FileName column. I have no idea why this is happening; any help?
Here is the error traceback:
File "C:/Users/crabtreec/Downloads/4_ReadHTML.py", line 105, in <module>
main()
File "C:/Users/crabtreec/Downloads/4_ReadHTML.py", line 92, in main
match=readHTML(FileName)
File "C:/Users/crabtreec/Downloads/4_ReadHTML.py", line 18, in readHTML
input_file = open(input_path,'r+')
FileNotFoundError: [Errno 2] No such file or directory: './HTML/a10-k20189292018.htm'
And here is what I'm running.
import os
import re
import csv
from bs4 import BeautifulSoup #<---- Need to install this package manually using pip
from urllib.request import urlopen

os.chdir('C:/Users/crabtreec/Downloads/') # The location of the file "CompanyList.csv"
htmlSubPath = "./HTML/" #<===The subfolder with the 10-K files in HTML format
txtSubPath = "./txt/" #<===The subfolder with the extracted text files
DownloadLogFile = "10kDownloadLog.csv" #a csv file (output of the 3DownloadHTML.py script) with the download history of 10-K forms
ReadLogFile = "10kReadlog.csv" #a csv file (output of the current script) showing whether item 1 is successfully extracted from 10-K forms
def readHTML(FileName):
input_path = htmlSubPath+FileName
output_path = txtSubPath+FileName.replace(".htm",".txt")
input_file = open(input_path,'r+')
page = input_file.read() #<===Read the HTML file into Python
#Pre-processing the html content by removing extra white space and combining them into one line.
page = page.strip() #<=== remove white space at the beginning and end
page = page.replace('\n', ' ') #<===replace the \n (new line) character with space
page = page.replace('\r', '') #<===remove the \r (carriage return) characters - if you're on Windows
page = page.replace('&nbsp;', ' ') #<===replace "&nbsp;" (a special HTML character for space) with space
page = page.replace('&#160;', ' ') #<===replace "&#160;" (a special HTML character for space) with space
while ' ' in page:
page = page.replace(' ', ' ') #<===remove extra space
#Using regular expression to extract texts that match a pattern
#Define pattern for regular expression.
#The following patterns find ITEM 1 and ITEM 1A as displayed as subtitles
#(.+?) represents everything between the two subtitles
#If you want to extract something else, here is what you should change
#Define a list of potential patterns to find ITEM 1 and ITEM 1A as subtitles
regexs = (r'bold;\">\s*Item 1\.(.+?)bold;\">\s*Item 1A\.', #<===pattern 1: with an attribute bold before the item subtitle
          r'b>\s*Item 1\.(.+?)b>\s*Item 1A\.', #<===pattern 2: with a tag <b> before the item subtitle
          r'Item 1\.\s*<\/b>(.+?)Item 1A\.\s*<\/b>', #<===pattern 3: with a tag </b> after the item subtitle
          r'Item 1\.\s*Business\.\s*<\/b(.+?)Item 1A\.\s*Risk Factors\.\s*<\/b') #<===pattern 4: with a tag </b> after the item+description subtitle
#Now we try to see if a match can be found...
for regex in regexs:
match = re.search (regex, page, flags=re.IGNORECASE) #<===search for the pattern in HTML using re.search from the re package. Ignore cases.
#If a match exists....
if match:
#Now we have the extracted content still in an HTML format
#We now turn it into a beautiful soup object
#so that we can remove the html tags and only keep the texts
soup = BeautifulSoup(match.group(1), "html.parser") #<=== match.group(1) returns the text inside the parentheses (.+?)
#soup.text removes the html tags and only keeps the text
rawText = soup.text #<=== keep it as str; encoding it to bytes here would break the re.sub below
#remove space at the beginning and end and the subtitle "business" at the beginning
#^ matches the beginning of the text
outText = re.sub(r"^business\s*", "", rawText.strip(), flags=re.IGNORECASE)
output_file = open(output_path, "w", encoding="utf-8") #<=== write the extracted text back out as UTF-8
output_file.write(outText)
output_file.close()
break #<=== if a match is found, we break the for loop. Otherwise the for loop continues
input_file.close()
return match
def main():
if not os.path.isdir(txtSubPath): ### <=== keep all text files in this subfolder
os.makedirs(txtSubPath)
csvFile = open(DownloadLogFile, "r") #<===A csv file with the list of 10-K file names (the first row is a header and is skipped below)
csvReader = csv.reader(csvFile, delimiter=",")
csvData = list(csvReader)
logFile = open(ReadLogFile, "a+") #<===A log file to track which file is successfully extracted
logWriter = csv.writer(logFile, quoting = csv.QUOTE_NONNUMERIC)
logWriter.writerow(["filename","extracted"])
i=1
for rowData in csvData[1:]:
if len(rowData):
FileName = rowData[7]
if ".htm" in FileName:
match=readHTML(FileName)
if match:
logWriter.writerow([FileName,"yes"])
else:
logWriter.writerow([FileName,"no"])
i=i+1
csvFile.close()
logFile.close()
print("done!")
if __name__ == "__main__":
main()
[Image: CSV of file info]
Your error message shows the lookup fails for the relative path ./HTML/a10-k20189292018.htm, which is resolved against the current working directory.
I would avoid using os.chdir to change the working directory; it is likely to complicate things. Instead, use pathlib and join paths explicitly so the file paths are less error-prone.
Try with this:
from pathlib import Path
base_dir = Path('C:/Users/crabtreec/Downloads/') # The location of the file "CompanyList.csv"
htmlSubPath = base_dir.joinpath("HTML") #<===The subfolder with the 10-K files in HTML format
txtSubPath = base_dir.joinpath("txt") #<===The subfolder with the extracted text files
DownloadLogFile = "10kDownloadLog.csv" #a csv file (output of the 3DownloadHTML.py script) with the download history of 10-K forms
ReadLogFile = "10kReadlog.csv" #a csv file (output of the current script) showing whether item 1 is successfully extracted from 10-K forms
def readHTML(FileName):
input_path = htmlSubPath.joinpath(FileName)
output_path = txtSubPath.joinpath(FileName.replace(".htm",".txt"))
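If it helps to verify where the lookup actually lands, here is a minimal sketch (illustrative; check_input is not part of your script) that resolves the path and reports it before opening; the / operator is pathlib shorthand for joinpath:
from pathlib import Path

base_dir = Path('C:/Users/crabtreec/Downloads/')
htmlSubPath = base_dir / "HTML"
txtSubPath = base_dir / "txt"

def check_input(FileName):
    input_path = htmlSubPath / FileName
    # Print the fully resolved path so a wrong CSV column or a wrong
    # base directory becomes immediately visible:
    if not input_path.is_file():
        print("missing:", input_path.resolve())
        return None
    return input_path

# e.g. check_input('a10-k20189292018.htm'), the file from your traceback
Printing the resolved path for the first few rows of the CSV also lets you confirm whether rowData[7] really is the FileName column.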
Just trying to work out how to mark something as a fail in PyPDF2 if there is a match on any page of a PDF document. I have been using the code below, which I have partly recycled and partly built myself.
The problem is that it prints "Fail" for every single line, which I don't need. I am trying to change it so it only prints "Fail" once if there are no matches on any page.
import PyPDF2
import re
import os
#create filereader object to read the PDF using PyPDF2
object = PyPDF2.PdfFileReader("shopping.pdf")
NumPages = object.getNumPages()
print(f"This document has {NumPages} pages")
for i in range(0, NumPages):
page = object.getPage(i)
text = page.extractText()
for line in text.splitlines():
if re.match('milk', line):
print("Pass the keyword is matched on page " + str(i), ": " + line)
else:
print("Fail")
re.match only returns a match if it exists at the beginning of a string. What you're probably looking for is re.search
Documentation: https://docs.python.org/3/library/re.html#search-vs-match
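For example (illustrative only):
import re

line = "2 litres of milk"
print(re.match('milk', line))   # None, because 'milk' is not at the start of the string
print(re.search('milk', line))  # <re.Match object; span=(12, 16), match='milk'>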
The solution is storing the matches in a list instead of printing an immediate result.
The printing should be done only after reading the whole file.
# [...]
loi = [] # Lines of Interest
for i in range(0, NumPages):
page = object.getPage(i)
text = page.extractText()
for line in text.splitlines():
if re.match('milk', line):
loi.append(f'{i}:{line}')
# Result
if len(loi) > 0: # or greater than a threshold
print('Pass. The keyword is matched on the following pages:')
print('\n'.join(loi))
else:
print('Fail.')
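If all you need is a single pass/fail verdict rather than the matching lines themselves, the two answers combine into a short check. A sketch using the legacy PdfFileReader API from the question, with "shopping.pdf" as there:
import re
import PyPDF2

reader = PyPDF2.PdfFileReader("shopping.pdf")
# any() stops at the first page whose text contains the keyword
found = any(
    re.search('milk', reader.getPage(i).extractText())
    for i in range(reader.getNumPages())
)
print("Pass" if found else "Fail")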
I am using textract to convert doc/docx files to text.
Here is my method:
def extract_text_from_doc(file):
temp = None
temp = textract.process(file)
temp = temp.decode("UTF-8")
text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
return ''.join(text)
I have two doc files, both with the docx extension. When I try to convert one of them to a string it works fine, but the other one throws this exception:
'There is no item named \'word/document.xml\' in the archive'
I looked into it further and found that zipfile.ZipFile(docx) is causing the problem.
The library code in question looks like this:
def process(docx, img_dir=None):
text = u''
# unzip the docx in memory
zipf = zipfile.ZipFile(docx)
filelist = zipf.namelist()
# get header text
# there can be 3 header files in the zip
header_xmls = 'word/header[0-9]*.xml'
for fname in filelist:
if re.match(header_xmls, fname):
text += xml2text(zipf.read(fname))
# get main text
doc_xml = 'word/document.xml'
text += xml2text(zipf.read(doc_xml))
# some more code
In the above code, the file that works returns a filelist with values like 'word/document.xml' and 'word/header1.xml',
but the file that does not work returns a filelist with these values:
['[Content_Types].xml', '_rels/.rels', 'theme/theme/themeManager.xml', 'theme/theme/theme1.xml', 'theme/theme/_rels/themeManager.xml.rels']
Since the second filelist doesn't contain 'word/document.xml', these lines:
doc_xml = 'word/document.xml'
text += xml2text(zipf.read(doc_xml))
throw the exception (internally, zipf.read tries to open the archive member named word/document.xml).
Can anyone please help? I don't know whether the problem is with the docx file or with the code.
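Not an answer to why the file is packaged that way, but checking the archive members before reading avoids the exception and tells you which kind of file you have. A minimal sketch (the file name is a placeholder):
import zipfile

def docx_main_xml(path):
    # Inspect the archive member list before assuming a Word package layout.
    with zipfile.ZipFile(path) as zipf:
        members = zipf.namelist()
        if 'word/document.xml' not in members:
            # Not a standard .docx payload (e.g. only theme parts),
            # so report it instead of letting zipf.read() raise.
            print(path, 'is not a standard docx; members:', members)
            return None
        return zipf.read('word/document.xml')

# docx_main_xml('report.docx')  # placeholder file name
Judging by the member list you posted ([Content_Types].xml, _rels/.rels, theme/...), the second file looks like a Word theme package rather than a document, so the file itself is more likely the problem than the code.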
I am trying to modify a line in every SQL file in a directory.
I have written code that navigates to the files' directory according to user input.
Each SQL file in that directory has these 2 lines:
--liquibase formatted sql
--changeset Jack:1 runOnChange:true splitStatements:false stripComments:false
What I am trying to do is loop through all the files and update the changeset line every time the script runs,
so those 2 lines will look like this:
--liquibase formatted sql
--changeset Ryan:2 runOnChange:true splitStatements:false stripComments:false
The position of the part I want to change is constant, but its content differs per file; another file might have Jie:6, which I would replace with Priyal:7. The name part is the person running the script, and the number after the colon is incremented.
Is there a cleaner way to achieve this?
This is just sample code I have which configures the paths:
anbpath = os.path.abspath("copy_views.py")
print(anbpath)
sqldir = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'database'))
path_to_views = sqldir+"/sql/edm/changes/"+source_release_version+"/mutable/view"
print(path_to_views)
destdir = os.path.abspath(os.path.join(os.path.dirname( __file__ ),'..', 'database'))
path_to_dest_rel_ver = destdir+"/sql/edm/changes/"+dest_release_version
path_to_dest = path_to_dest_rel_ver+"/mutable/view"
I will traverse all files in path_to_dest using os.walk
If your file is called "file.txt" and the name part is constant, then what you are looking for is:
# Read the contents of file
with open("file.txt", 'r') as fp:
lines = fp.readlines()
# Identify the tokens and the numbers within them, then increment the numbers
words = lines[1].split()
tokens = words[1].split(":")
words[1] = "name{0}:{1}".format(int(tokens[0][4:])+1, int(tokens[1])+1)
lines[1] = ' '.join(words)
# Write back the updated lines
with open("file.txt", 'w') as fp:
fp.writelines(lines)
Initial content of the file:
["--liquibase formatted sql",
 "--changeset name3:21 runOnChange:true splitStatements:false stripComments:false"]
After modification:
["--liquibase formatted sql",
 "--changeset name4:22 runOnChange:true splitStatements:false stripComments:false"]
However, if the name is not constant, we can still split the token at ":" to identify the number after it, but to identify the number at the end of the name we will have to use a regex.
You can use regex to solve this:
The following code will replace the string "name1:1" with "name2:2" if it finds the correct sequence, and it preserves the rest of the line:
import re
# read the file, line by line and then:
line_fixed = re.sub(r"(--changeset )name1:1(.*)", r"\1name2:2\2", line)
# then save the fixed line to a temporary file until you have read the whole file
Here's a way to do it with regular expressions:
import re
lines = [
"--liquibase formatted sql",
"--changeset Jack:2 runOnChange:true splitStatements:false stripComments:false",
"--liquibase formatted sql",
"--changeset Ryan:6 runOnChange:true splitStatements:false stripComments:false",
# etc ...
]
def update_line(line):
p = re.compile(r'--changeset (.+):(\d+) runOnChange')
matches = p.match(line)
if not matches:
return line
else:
replacement = '--changeset {}:{} runOnChange'.format(matches.group(1),
int(matches.group(2))+1)
return p.sub(replacement, line)
for line in lines:
updated_line = update_line(line)
print(repr(updated_line))
Output:
'--liquibase formatted sql'
'--changeset Jack:3 runOnChange:true splitStatements:false stripComments:false'
'--liquibase formatted sql'
'--changeset Ryan:7 runOnChange:true splitStatements:false stripComments:false'
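To tie this back to the os.walk traversal mentioned in the question, the same idea can be applied while rewriting each .sql file in place. A sketch, assuming every changeset line starts with --changeset name:number; update_tree and runner are illustrative names:
import os
import re

CHANGESET = re.compile(r'--changeset \S+:(\d+)')

def update_line(line, runner):
    m = CHANGESET.match(line)
    if not m:
        return line
    # Swap in the current runner, bump the number, keep the rest of the line.
    return '--changeset {}:{}'.format(runner, int(m.group(1)) + 1) + line[m.end():]

def update_tree(path_to_dest, runner):
    for dirpath, dirnames, filenames in os.walk(path_to_dest):
        for name in filenames:
            if name.endswith('.sql'):
                full_path = os.path.join(dirpath, name)
                with open(full_path) as fp:
                    lines = fp.readlines()
                with open(full_path, 'w') as fp:
                    fp.writelines(update_line(line, runner) for line in lines)

# update_tree(path_to_dest, 'Ryan')  # path_to_dest as built in the question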
I have a text file (heavily modified for this example) which has some data that I want to extract and do some calculations with it. However the text file is extremely messy, so I'm trying to clean it up and write it out to new files first.
Here is the .txt file I'm working with: http://textuploader.com/5elql
I am trying to extract the data which is under the titles (called "Important title"). The only reliable way to do that is to first locate a string which always occurs in the file, called "DATASET", because the mess above and below the important data covers an arbitrary number of lines and is difficult to remove manually. Once that's done, I want to store the data in separate files so that it is easier to analyse, like this:
http://textuploader.com/5elqw
The file names will be concatenated with the title + the date.
Here is what I have tried so far
with open("example.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
OUTPUT
IMPORTANT TITLE 1
DATE: 12/30/2015
IMPORTANT TITLE 2
DATE: 01/03/2016
So it appears my loop manages to locate the lines where the titles inside the file are and print them out. But this is where I run out of steam. I have no idea on how to extract the data under those titles from there onwards. I have tried using file.readlines() but it outputs all the mess that is in between Important Title 1 and Important Title 2.
Any advice on how I can read all the data under the titles and output them into separate files? Thanks for your time.
You could use regex.
import re
pattern = r"(\s+X\s+Y\s*)|(\s*\d+\s+\d+\s*)"
prog = re.compile(pattern)
with open("example.txt") as file:
cur_filename = ''
content = ""
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
elif line.startswith("DATE:"):
fileDate = line[6:]
cur_filename = (fileTitle.strip() + fileDate.strip()).replace('/', '-')
print(cur_filename)
content_title = fileTitle + line
elif prog.match(line):
content += line
elif cur_filename and content:
    with open(cur_filename, 'w') as fp:
        fp.write(content_title)
        fp.write(content)
    cur_filename = ''
    content = ''
# after the loop: write out the last dataset too, in case the file ends while one is still pending
if cur_filename and content:
    with open(cur_filename, 'w') as fp:
        fp.write(content_title)
        fp.write(content)
I don't know exactly how you want to store your data, but assuming you want a dictionary, you could use a regex to check whether the incoming line matches the pattern, and then use fileTitle (which is still in scope) as the key and append the values to it. I also added rstrip('\r\n') to remove the newline characters after fileTitle.
import re
#if you don't want to store the X and Y, just use re.compile('\d\s+\d+')
p = re.compile('(\d\s+\d+)|(X\s+Y)')
data={}
with open("input.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:].rstrip('\r\n')
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
if p.match(line):
if fileTitle not in data:
data[fileTitle]=[]
line=line.rstrip('\r\n')
data[fileTitle].append(line.split('\t'))
if len(data[fileTitle][len(data[fileTitle])-1]) == 3:
data[fileTitle][len(data[fileTitle])-1].pop()
print(data)
Yet another regex solution:
import re

sep = '*************************\n'
pattern = r'DATASET[^%]*'
good_stuff = re.compile(pattern)
pattern = r'^DATASET: (.*?)$'
title = re.compile(pattern, flags = re.MULTILINE)
pattern = r'^DATE: (.*?)$'
date = re.compile(pattern, flags = re.MULTILINE)
with open(r'foo.txt') as f:
data = f.read()
for match in good_stuff.finditer(data):
data = match.group()
important_title = title.search(data).group(1)
important_date = date.search(data).group(1)
important_date = important_date.replace(r'/', '-')
fname = important_title + important_date + '.txt'
print(sep, fname)
print(data)
##with open(fname, 'w') as f:
## f.write(data)
I have several .txt files and I need to extract certain data from them. Files looks similar, but each of them stores different data. Here is an example of that file:
Start Date: 21/05/2016
Format: TIFF
Resolution: 300dpi
Source: X Company
...
There is more information in the text files, but I need to extract the start date, format and the resolution. Files are in the same parent directory ("E:\Images") but each file has its own folder. Therefore I need a script for recursive reading of these files. Here is my script so far:
#importing a library
import os
#defining location of parent folder
BASE_DIRECTORY = r'E:\Images' # raw string, so the backslash is not treated as an escape character
#scanning through subfolders
for dirpath, dirnames, filenames in os.walk(BASE_DIRECTORY):
for filename in filenames:
#build the full path first, then open the file (opening by the bare filename fails outside the cwd)
txtfile_full_path = os.path.join(dirpath, filename)
txtfile = open(txtfile_full_path, "r")
start_date = data_format = resolution = None #defaults, in case a field is missing
for line in txtfile:
    if line.startswith('Start Date:'):
        start_date = line.split()[-1]
    elif line.startswith('Format:'):
        data_format = line.split()[-1]
    elif line.startswith('Resolution:'):
        resolution = line.split()[-1]
print(
txtfile_full_path,
start_date,
data_format,
resolution)
Ideally it might be better if Python extracted it together with the name of each file and saved everything to a text file. Because I don't have much experience in Python, I don't know how to progress any further.
Here is the code I've used:
# importing libraries
import os
# defining location of parent folder
BASE_DIRECTORY = r'E:\Images' # raw string, so the backslash is not treated as an escape character
output_file = open('output.txt', 'w')
output = {}
file_list = []
# scanning through sub folders
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
for f in filenames:
if 'txt' in str(f):
e = os.path.join(str(dirpath), str(f))
file_list.append(e)
for f in file_list:
print(f)
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
if 'Start Date:' in line:
output[f].append(line)
elif 'Format' in line:
output[f].append(line)
elif 'Resolution' in line:
output[f].append(line)
tabs = []
for tab in output:
tabs.append(tab)
tabs.sort()
for tab in tabs:
output_file.write(tab + '\n')
output_file.write('\n')
for row in output[tab]:
output_file.write(row)
output_file.write('\n')
output_file.write('----------------------------------------------------------\n')
input() # keep the console window open until Enter is pressed
You do not need regular expressions. You can use
basic string functions:
txtfile=open(filename,"r")
for line in txtfile:
if line.startswith("Start Date:"):
start_date = line.split()[-1]
...
Add a break once you have collected all the information, as in the sketch below.
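For instance, a sketch of that early-exit pattern with the three fields from the question (extract_fields is an illustrative name):
def extract_fields(path):
    fields = {}
    with open(path) as txtfile:
        for line in txtfile:
            for key in ('Start Date:', 'Format:', 'Resolution:'):
                if line.startswith(key):
                    fields[key] = line.split(':', 1)[1].strip()
            if len(fields) == 3:
                break  # all information collected, stop reading this file
    return fields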
To grab the Start Date, you can use the following regex:
^(?:Start Date:)\D*(\d+/\d+/\d+)$
# ^ anchor the regex to the start of the line
# match the literal string "Start Date:" ((?:...) is a non-capturing group)
# followed by zero or more non-digit characters
# followed by a capturing group holding the start date
In Python this would be:
import re
regex = r"^(?:Start Date:)\D*(\d+/\d+/\d+)$"
# the variable line points to your line in the file
if re.search(regex, line):
# do sth. useful here
See a demo on regex 101.
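Applied to a single line from the file, it might look like this (the sample line is taken from the question):
import re

regex = r"^(?:Start Date:)\D*(\d+/\d+/\d+)$"
line = "Start Date: 21/05/2016"
m = re.search(regex, line)
if m:
    print(m.group(1))  # prints 21/05/2016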