I have a question about code that extracts text strings from a PDF file and writes the output to a .csv file.
The output is stored in output.csv. As you can see, it returns the values on p. 27 (where the code works) and on p. 29, but p. 28 is missing. What I want is for the text string on p. 28 to be returned as well, but there the code is not working.
Can somebody tell me what I'm doing wrong? With the second code snippet further below, pdfminer does read out the proper output that is needed.
import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
# method 3: object oriented programming
class Program:
#initialisation (happens when Program() is called for the first time)
def __init__(self):
# locations
# this defines the location of the workspace and directory of the data to process
self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
# lookuptable
# this converts the lookuptable from maximo to a list which can be used for comparison
self.lu_file = self.ws_loc / "lookuptable.csv"
with open(self.lu_file, newline='') as f:
reader = csv.reader(f)
self.lu_list = list(filter(None,list(reader)))
self.lu_list = [each[0] for each in self.lu_list]
def listener(self,q):
'''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
# open output.csv in location workspace/data/ and use as 'f'
with open(self.ws_loc / 'output.csv', 'a') as f:
#start infinite listening loop until 'kill' message is received
while 1:
# get the message which is first in q (queue)
m = q.get()
# break loop if message is kill and close file 'output.csv'
if m == 'kill':
f.close()
break
# if message is not 'kill' then write message to file and flush file
f.write(m)
f.flush()
def worker(self, file, q):
''' processes a pdf file given by main() and writes output to q (queue)'''
# init PDF class (this class is used to get pages from the PDF and process pdftext)
PDF = self.PDF(self.dat_loc,self.lu_list,0)
# get all the pages from PDF: contains pages = [page1, ..., pageN]
# pageN = "bla bla \n bla etc."
PDFpages = PDF.getPages(file)
pages = []
for page in PDFpages:
pages.append(page)
# varargs defines extra data for files (this is where metadata is stored)
# varargs should not be filled here, but it is initialized here.
varargs = ''
# check if file is a manual (this can be seen as an example for a varargs entry)
        # it should contain at least one ',' (this creates a new column entry in the csv)
# PDF.fileCategory() which is a class within the Program class, can be taken as an example
varargs+= PDF.fileCategory(file,pages) + ',' + PDF.fileSupplier(file, pages) + ',' + PDF.fileRev(file, pages)
# new vararg can be added like: varargs+= THE_VARARG
# initialise pageNum (which is a page number identifier inside the for loop)
pageNum = 1
        # create an empty datastack (which is the message that will be sent to q (queue))
datastack = ''
# for each page do...
for page in pages:
'''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
found_strings, found = PDF.find_tag(page)
# found_stringsrev, foundrev = PDF.find_rev(page)
# if tags are found, then fix the tags such that they are correct with
# Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
# if foundrev:
# string = ''
# fixedstring = ''
# for stringrev in found_stringsrev:
# # fill datastack with found tags
# datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + ',' + stringrev + ',' + varargs + '\n'
if found:
for string in found_strings:
# if correct, do not change
fixedstring = string
# check if the tag matches the correct regexpression ('regex' or 're')
if re.match('^(\d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', string)!=None:
# else fix the tag
fixedstring = self.putStripe(string)
# fill datastack with found tags
datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + varargs + '\n'
# next page, so pageNum becomes pageNum + 1
pageNum +=1
# if the datastack is empty, we are still interested in the varargs:
# (so empty tag columns are added)
if datastack=='':
datastack = file + ',' + ',' + ',' + varargs + '\n'
# put the datastack message inside of the q (queue)
q.put(datastack)
# terminate the PDF class so that the pdf file is closed in a correct way
PDF.terminate()
# return (in case the datastack should be printed)
return datastack
def putStripe(self,input):
'''This function fixes a tag that is not correct'''
# strip the tag from spaces
input = re.sub(' ','',input)
# for each string that matches the expression write to words
words = re.findall('[0-9][A-Za-z]+', input)
words += re.findall('[A-Za-z][0-9]+', input)
# for each match inside the tag add a '-' in the second position
for word in words:
i = input.find(word)+1
input = input[:i] + '-' + input[i:]
# return the fixed tag
return input
def main(self):
try:
# initiate time
t = time.time()
            # create a pool for parallel processing (the number of cpu threads is obtained automatically)
pool = mp.Pool(mp.cpu_count() + 2)
# create a manager
manager = mp.Manager()
# from the pool manager create a queue object which can be used to
# exchange data between the worker and listener
q = manager.Queue()
# start up listener first
# ignore warning, it is being used
watcher = pool.apply_async(self.listener, (q,))
# fire off workers (basically assign them jobs)
jobs = []
            # NOTE: this for loop does not block; the code proceeds with parallel threading
            # as if the result of each iteration were instantly completed
# each file in the data location is a job
for file in os.listdir(self.dat_loc):
# assign the job to a worker
job = pool.apply_async(self.worker, (file, q))
                # append the job to jobs (for data acquisition)
jobs.append(job)
# this is used to get the data back from jobs
for job in tqdm(jobs):
#print('')
#print(job.get()[:-1])
job.get()
            # print elapsed time (good for project management)
print('elapsed time = ' + str(time.time()-t) + ' seconds')
        # catch interrupt and try to properly terminate workers (might take time)
        # best to just do everything in batches and don't interrupt
except KeyboardInterrupt:
print("\nCaught KeyboardInterrupt, terminating workers")
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
pool.terminate()
            raise SystemExit(1)
        # always execute (kills workers and listener)
finally:
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
def execute(self):
self.main()
class PDF:
# from PDF.
def __init__(self,dat_loc,lu_list,maxpages):
self.dat_loc = dat_loc
self.lu_list = lu_list
self.lu_list_f = 0
self.password = ""
self.maxpages = maxpages
self.caching = True
self.rsrcmgr = PDFResourceManager()
self.retstr = StringIO()
self.laparams = LAParams()
self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.pagenos=set()
# from PDF.
def getPages(self,file):
self.fp = open(self.dat_loc / file, 'rb')
pages = PDFPage.get_pages(self.fp,
self.pagenos,
maxpages=self.maxpages,
password=self.password,
caching=self.caching,
check_extractable=True)
return pages
# from PDF.
def fileCategory(self,file,pages):
rules = []
rules.append(['Manual',['ANLAGE - INSTALLATION','User Guide','MANUAL','Manual','manual','Handleiding','handleiding','Instruction','instructions','Instructie', 'Guide', 'GUIDE']])
rules.append(['Specification',['SPECIFICATION','Specification','Specificatie']])
rules.append(['Datasheet',['DATA BOOK','UTILITIES LIST','DATA PACKAGE','Data Package','data-sheet','Datasheet','DATASHEET','datasheet','DATA SHEET','Data Sheet','Data sheet','data sheet']])
rules.append(['Spare part list',['SPARE PARTS LIST']])
rules.append(['Invoice',['BILL OF MATERIAL','invoice','Invoice','INVOICE','Purchase order','Purchase Order','PURCHASE ORDER']])
rules.append(['Schematic Diagram',['SCHEMATIC DIAGRAM','Schematic Diagram','Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']])
rules.append(['Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']])
rules.append(['Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']])
rules.append(['Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']])
fileCategory = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileCategory += ',' + category
else:
fileCategory += ',' + 'Unreadable'
return fileCategory
# from PDF.
def fileSupplier(self,file,pages):
rules = []
rules.append(['JE Jacobs',['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']])
rules.append(['Emerson',['Emerson', 'Emerson Process Management', 'EMERSON',]])
rules.append(['Air Liquide',['Air Liquide', 'AIR LIQUIDE']])
rules.append(['Rosemount',['ROSEMOUNT', 'Rosemount']])
rules.append(['Deltak',['Deltak', 'DELTAK']])
rules.append(['AviComp',['AVICOMP', 'Avicomp', 'avicomp']])
fileSupplier = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileSupplier += ',' + category
else:
fileSupplier += ',' + 'Supplier N/A'
return fileSupplier
# from PDF.
def fileRev(self,file,pages):
fileRev = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for line in self.pagestr(page).splitlines():
if re.match('^(Rev.*).*$', line):
found = True
if found:
break
if found:
break
counter+=1
if found:
fileRev += ',' + line
else:
fileRev += ',' + ''
return fileRev
# from PDF.
def find_string_lookup(self,page,pageNum,file,varargs):
datastack = []
data = []
found = False
for line in self.pagestr(page).splitlines():
line = re.sub('[^A-Za-z0-9]+', '', line)
counter = 0
for tag in self.lu_list_f:
if line.find(tag)!=-1:
found = True
data = file + ',' + str(self.lu_list[counter][0]) + ',' + str(pageNum) + varargs +'\n'
if data not in datastack:
datastack += [data]
counter+=1
return datastack, found
# from PDF.
def find_string(self,page,strings,Method=None):
datastack = []
data = []
found = False
if Method=='ALPHABET_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line) for line in strings]
elif Method=='ALPHABETCAPS_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line).upper() for line in strings]
elif Method=='ALPHABETCAPS':
tags = [line.upper() for line in strings]
else:
tags = strings
for line in self.pagestr(page).splitlines():
if Method=='ALPHABET_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line)
elif Method=='ALPHABETCAPS_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line).upper()
elif Method=='ALPHABETCAPS':
line = line.upper()
i = 0
for tag in tags:
if tag != '':
if line.find(tag)!=-1:
found = True
data = strings[i]
if data not in datastack:
datastack += [data]
i+=1
return datastack, found
# from PDF.
def find_tag(self,page):
datastack = []
found = False
for line in self.pagestr(page).splitlines():
tags = re.findall('^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', line)
for tag in tags:
if tag not in datastack:
datastack += [tag]
found = True
return datastack, found
# from PDF.
# def find_rev(self,page):
# datastack = []
# found = False
# for line in self.pagestr(page).splitlines():
# tags = re.findall('^(Rev.*).*$', line)
# for tag in tags:
# if tag not in datastack:
# datastack += [tag]
# found = True
# return datastack, found
# from PDF.
def pagestr(self,page):
self.retstr.truncate(0)
self.retstr.seek(0)
self.interpreter.process_page(page)
return self.retstr.getvalue()
# from PDF.
def terminate(self):
self.fp.close()
self.device.close()
self.retstr.close()
# start the code (the proper way)
if __name__ == '__main__':
Program().execute()
If I read out the PDF with this code in Python (also with pdfminer):
import re, csv, os
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
class glb():
workspace_folder = Path('C:/Users/pco/Desktop/workspace')
data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
lookup_file = workspace_folder / "lookuptable.csv"
with open(lookup_file, newline='') as f:
reader = csv.reader(f)
lookup_list = list(reader)
lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9]+', '', str(line)) for line in lookup_list]))
def find_tagnumbers(path):
pagelines = []
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
page_no = 1
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
page_str = retstr.getvalue()
pagelines.append(page_str.splitlines())
retstr.truncate(0)
retstr.seek(0)
page_no +=1
page_no +=-1
print(pagelines)
fp.close()
device.close()
retstr.close()
return 1
find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')
It does return 47-AT -0053 (that value shows up in the output when I print pagelines). But if I run the first code, that value does not end up in the output file.
P.S. My coding skills are at beginner level (that is why I write out all the steps).
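As a debugging aid, here is a minimal sketch (my own, not part of the code above) that runs the same pattern used in PDF.find_tag() against the string reported for p. 28, so you can check in isolation whether the regex step is the one dropping that line:
import re
# the pattern used in PDF.find_tag()
tag_pattern = r'^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$'
# sample line, taken from what pdfminer reportedly extracts on p. 28
sample_line = '47-AT -0053'
print(re.findall(tag_pattern, sample_line))  # an empty list means find_tag() skips this line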
I have 4 functions that should grab some highlighted text from a specific docx file. scan4<colour> is the variable holding that text. These functions are near identical, but they search for and replace different highlighted text; they print out the same, but they don't all replace the text.
These are 2 of the 4 functions; the yellow one works while the green one doesn't.
What the code does is search for the text, replace it, and then encrypt it, using the encrypted string in the main document; the other function does the exact same but searches for a different highlight colour. When I try the encryption, the first function works but the second one doesn't.
This code searches for yellow text, encodes it, and then replaces it, so the un-encrypted document shows the encrypted string of the contents:
def securi1_key():
file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi1.key", "rb")
global key1
key1 = file.read()
file.close()
def rewrite_yellow():
securi1_key()
save_yellow_text()
# get key from file
file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi1.key", "rb")
texty = scan4yellow.decode("utf-8")
encodedy = texty.encode()
# encrypt message
f = Fernet(key1)
encryptedy = f.encrypt(encodedy)
print(scan4yellow)
print(scan4yellow.decode("utf-8"))
document = docx.Document(f1)
for paragraph in document.paragraphs:
if scan4yellow.decode("utf-8") in paragraph.text:
inline = paragraph.runs
# loops for runs
for i in range(len(inline)):
if scan4yellow.decode("utf-8") in inline[i].text:
text = inline[i].text.replace(scan4yellow.decode("utf-8"), encryptedy.decode("utf-8"))
inline[i].text = text
document.save(f1)
def save_yellow_text():
securi1_key()
fp = f1
p = Path(fp)
filename1 = p.stem
storedtexty = filename1 + " Yellow Text"
storedtextencryptedy = storedtexty + ".encrypted"
list_of_files = os.listdir("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/")
if storedtexty in list_of_files:
storedtexty = (storedtexty + "1")
file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtexty, "w+")
file.write(scan4yellow.decode("utf-8"))
input_file1 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtexty)
if storedtextencryptedy in list_of_files:
storedtextencryptedy = (storedtextencryptedy + "1")
output_file1 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextencryptedy)
with open(input_file1, "rb") as f:
data = f.read()
fernet = Fernet(key1)
encrypted = fernet.encrypt(data)
with open(output_file1, "wb") as f:
f.write(encrypted)
file.close()
os.remove(input_file1)
This code should do the exact same, but for the colour green:
def securi2_key():
file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi2.key", "rb")
global key2
key2 = file.read()
file.close()
def rewrite_green():
securi2_key()
save_green_text()
# get key from file
file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi2.key", "rb")
textg = scan4green.decode("utf-8")
encodedg = textg.encode()
print(encodedg)
# encrypt message
f = Fernet(key2)
encryptedg = f.encrypt(encodedg)
print(encryptedg)
document = docx.Document(f1)
for paragraph in document.paragraphs:
if scan4green.decode("utf-8") in paragraph.text:
inline = paragraph.runs
# loops for runs
for i in range(len(inline)):
if scan4green.decode("utf-8") in inline[i].text:
text = inline[i].text.replace(scan4green.decode("utf-8"), encryptedg.decode("utf-8"))
inline[i].text = text
document.save(f1)
def save_green_text():
securi2_key()
fp = f1
p = Path(fp)
filename2 = p.stem
storedtextg = filename2 + " Green Text"
storedtextencryptedg = storedtextg + ".encrypted"
list_of_files = os.listdir("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/")
if storedtextg in list_of_files:
storedtextg = (storedtextg + "1")
file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextg, "w+")
file.write(scan4green.decode("utf-8"))
print(scan4green.decode("utf-8") + "tested1")
input_file2 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextg)
if storedtextencryptedg in list_of_files:
storedtextencryptedg = (storedtextencryptedg + "1")
output_file2 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextencryptedg)
with open(input_file2, "rb") as f:
data = f.read()
fernet = Fernet(key2)
encrypted = fernet.encrypt(data)
with open(output_file2, "wb") as f:
f.write(encrypted)
file.close()
os.remove(input_file2)
I should end up with an incoherent string replacing the actual text, and with the actual text saved encrypted in another file, but this only works for the first (yellow) function and not for the green function.
Ideally it should take the text from the read-in file, write a copy of it out to a file and encrypt that, then take the resulting encrypted string and put it back where the original text was in the read-in file. It only works for the yellow code, while the green code and the other near-identical functions do not work.
I have found an answer. What I changed has nothing to do with the code I provided: when I read in the text it was scan4green = (word.find(tag_t).text.encode('utf-8').lower()). This apparently caused it not to work properly, while scan4yellow did work because I didn't have .lower() attached to the end.
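In other words (a minimal sketch of the assumed fix; word and tag_t come from the reading-in code, which is not shown here): lower-casing the bytes means the search string no longer matches the original casing in the paragraph runs, so the in-place replacement never fires.
# before: lower-cased text no longer matches the casing in the document,
# so 'scan4green.decode("utf-8") in paragraph.text' never becomes True
scan4green = word.find(tag_t).text.encode('utf-8').lower()
# after: keep the original casing so rewrite_green() can find and replace it
scan4green = word.find(tag_t).text.encode('utf-8')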
I have a PDF file and I am trying to find a specific text in the PDF and highlight it using Python.
I found PyPDF2, which can highlight part of a PDF when we give the coordinates of the wanted highlight position in the file.
I am trying to find a tool which can give me the position of a given text in the PDF.
PyMuPDF can find text and return its coordinates. You can use this in conjunction with the PyPDF2 highlighting method to accomplish what you're describing, or you can just use PyMuPDF to highlight the text.
Here is sample code for finding text and highlighting with PyMuPDF:
import fitz
### READ IN PDF
doc = fitz.open("input.pdf")
for page in doc:
### SEARCH
text = "Sample text"
text_instances = page.search_for(text)
### HIGHLIGHT
for inst in text_instances:
highlight = page.add_highlight_annot(inst)
highlight.update()
### OUTPUT
doc.save("output.pdf", garbage=4, deflate=True, clean=True)
With the new version of PyMuPDF, some methods were deprecated.
Here is the sample code as per the recent version. Secondly, I've also added a comment to each highlight, which helps the user navigate the annotations.
import fitz
pdfIn = fitz.open("page-4.pdf")
for page in pdfIn:
print(page)
texts = ["SEPA", "voorstelnummer"]
text_instances = [page.search_for(text) for text in texts]
# coordinates of each word found in PDF-page
print(text_instances)
# iterate through each instance for highlighting
for inst in text_instances:
annot = page.add_highlight_annot(inst)
# annot = page.add_rect_annot(inst)
## Adding comment to the highlighted text
info = annot.info
info["title"] = "word_diffs"
info["content"] = "diffs"
annot.set_info(info)
annot.update()
# Saving the PDF Output
pdfIn.save("page-4_output.pdf")
If you are on Windows and have Acrobat Pro (not reader), you can try the old Component Object Model with Python or VBA.
import win32com, winerror, os
from win32com.client.dynamic import ERRORS_BAD_CONTEXT
ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
win32com.client.gencache.EnsureModule('{E64169B3-3592-47d2-816E-602C5C13F328}', 0, 1, 1)
avDoc = win32com.client.DispatchEx('AcroExch.AVDoc')
avDoc.Open(src, src)
avDoc.BringToFront()
pdDoc = avDoc.GetPDDoc()
jsoObject = pdDoc.GetJSObject()
for pageNo in range(1):
pdfPage = pdDoc.AcquirePage(pageNo)
pageHL = win32com.client.DispatchEx('AcroExch.HiliteList')
_ = pageHL.Add(0, 9000)
pageSel = pdfPage.CreatePageHilite(pageHL)
pdfText = ""
for wordNo in range(pageSel.GetNumText()):
word = pageSel.GetText(wordNo)
pdfText += word
if keyword in pdfText:
wordToHl = win32com.client.DispatchEx('AcroExch.HiliteList')
wordToHl.Add(wordNo, 1)
wordHl = pdfPage.CreateWordHilite(wordToHl)
rect = wordHl.GetBoundingRect()
annot = jsoObject.AddAnnot()
props = annot.GetProps()
props.Type = "Square"
props.Page = pageNo
props.Hidden = False
props.Lock = True
props.Name = word
props.NoView = False
props.Opacity = 0.3
props.ReadOnly = True
props.Style = "S"
props.ToggleNoView = False
props.PopupOpen = False
popupRect = [rect.Left - 5, rect.Top + 5, rect.Left + 40, rect.Top - 20]
props.Rect = popupRect
props.PopupRect = popupRect
props.StrokeColor = jsoObject.Color.Red
props.FillColor = jsoObject.Color.Yellow
annot.SetProps(props)
print(f'Found {keyword}')
import csv
import requests
import re
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#CREATE CSV FILE
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)
#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
reader = csv.reader(f)
matches = list(reader)
for id in matches:
id = str(id)
id = re.sub("[^0-9]","",id)
url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
print (url)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
#GET TEAMS AND SCORES
score = soup.findAll("div",{"class":"col-md-5 center"})
team_home = score[0]
team_home = str(team_home)
team_home = re.search('title="(.*)" />',team_home)
team_home = team_home.group(1)
team_away = score[1]
team_away = str(team_away)
team_away = re.search('title="(.*)" />',team_away)
team_away = team_away.group(1)
goals_home = score[2]
goals_home = str(goals_home)
goals_home = re.sub('</h2></div>','',goals_home)
goals_home = re.sub('<div class="col-md-5 center"><h2>','',goals_home)
goals_away = score[3]
goals_away = str(goals_away)
goals_away = re.sub('</h2></div>','',goals_away)
goals_away = re.sub('<div class="col-md-5 center"><h2>','',goals_away)
#GET HOME STATS
tables = soup.findChildren('table')
stats_home = tables[0]
list_of_rows_home = []
for row in stats_home.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_home.append(list_of_cells)
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
cell = list_of_rows_home[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_home:
del row[2]
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
row.append(team_home)
row.append(goals_home)
row.append(team_away)
row.append(goals_away)
#GET AWAY STATS
stats_away = tables[1]
list_of_rows_away = []
for row in stats_away.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_away.append(list_of_cells)
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
cell = list_of_rows_away[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_away:
del row[2]
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
row.append(team_away)
row.append(goals_away)
row.append(team_home)
row.append(goals_home)
#COMPILE INTO ONE TABLE
list_of_rows = list_of_rows_home + list_of_rows_away
#WRITE TO CSV
writer.writerows(list_of_rows)
My input file is a basic Excel file with the match IDs all lined up in column one. When the script creates the output file, it is blank, and I am not getting any error messages either.
The issue is in your regex search, so perhaps change it to:
team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)
Alternative:
team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)
The /> is not needed, and with the space in front of it the title="..." part never matches, so re.search returns None; calling group(1) on that then raises an AttributeError and the script stops. If you want to include />, then remove the space from your regex pattern, since that space is ultimately what kills it.
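To illustrate, here is a small sketch with a made-up img tag (similar to what str() on the BeautifulSoup element would give, assuming there is no space before the />) showing why the original pattern returns None while the suggested ones match:
import re
team_home = '<img src="logo.png" title="FC Example"/>'  # hypothetical stringified tag
print(re.search('title="(.*)" />', team_home))           # None -> .group(1) raises AttributeError
print(re.search('title="(.*)"', team_home).group(1))     # FC Example
print(re.search('title="(.*)"/>', team_home).group(1))   # FC Example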
I have written this code:
import os
import re
import string
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
for lines, text in enumerate(f):
found = text.find('*SURFACE')
while found > -1:
print found, lines, text
found = text.find('*SURFACE', found + 1)
n.write(text)
##
f.close()
n.close()
This is what the *.inp file looks like (it is usually about 30 MB):
*SURFACE, NAME = BOTTOM, TYPE = ELEMENT
40012646, S2
40012647, S2
40012648, S2
40012649, S2
40012650, S2
40012651, S2
*SURFACE, NAME = ALL_INT_TIE_1, TYPE = ELEMENT
40243687, S3
40243703, S3
40243719, S3
40243735, S3
40243751, S3
40243767, S3
**
*TIE, NAME = INTERNAL_TIE, POSITION TOLERANCE = 1.0 , ADJUST=NO
SLAVE,MASTER
*TIE, NAME = SKN_REF_1
ALL_INT_FRONT, ALL_EXT_FRONT
*TIE, NAME = SKIN_LAT
ALL_INT_LAT, ALL_EXT_LAT
*TIE, NAME = SKIN_TIE_1
ALL_INT_TIE_1, ALL_INT_TIE_2
**
*SURFACE , NAME = TOP, COMBINE = UNION
TOP_1
TOP_2
**HM_UNSUPPORTED_CARDS
*END PART
*****
What it does is clear. What I would like to achieve is to get all the lines between the *SURFACE cards that begin with a number, which I will then have to arrange differently, but I will worry about that later.
I rewrote the code because I could not get it to work as suggested. Now it is creating the blocks as I need them, but how do I work on each block?
I need to separate all the elements (a number followed by S1, S2 and so on) and create groups for each block, sorted by S1, S2 and so on. The final result should look like:
*ELSET, ELSET=TOP_S1
40221320, 40221306, 40221305, 40221304, 40221290, 40221289, 40221288, 40221274,
40221273, 40221272, 40221258, 40221257, 40221256, 40221242, 40221241, 40221240,
*SURFACE, NAME = TOP, TYPE = ELEMENT
TOP_S1,S1
import os
import re
import string
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
in_surface_block = False;
for line_num, text in enumerate(f):
found = text.find('*SURFACE')
if found > -1:
in_surface_block=True;
print found, line_num, text
surface_lines = []
continue
if in_surface_block:
m = re.match('\s*\d+\,\s*\w\d+',text)
if m:
mtext = m.group(0)
## p=surface_lines.append(text)
print mtext
## ntext = surface_lines.append(m.group(0))
## n.write(ntext)
##
f.close()
n.close()
I hope it is clear
I think this will do what you want:
import os
import re
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
in_surface_block = False;
for line_num, text in enumerate(f):
found = text.find('*SURFACE')
if found > -1:
in_surface_block=True;
print found, line_num, text
surface_lines = []
continue
if in_surface_block:
if re.match('\s*\d+', text):
surface_lines.append(text)
else:
in_surface_block = False
            # do surface lines work here:
            # surface_lines is a list with all the lines in a surface block
            # that start with a number
...
##
f.close()
n.close()
Edit: Fixed logic error
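As a follow-up, here is a minimal sketch (my own illustration, not part of the answer above) of what the "do surface lines work here" step could look like: it groups one block's element lines by their face label (S1, S2, ...) and writes ELSET cards plus a *SURFACE card in the format shown in the question. The surface name is assumed to be captured from the *SURFACE line when the block starts.
import re
from collections import defaultdict
def write_surface_groups(surface_name, surface_lines, out):
    # group element lines like '40221320, S1' by their face label
    faces = defaultdict(list)
    for line in surface_lines:
        m = re.match(r'\s*(\d+)\s*,\s*(\w\d+)', line)
        if m:
            faces[m.group(2)].append(m.group(1))
    # write one ELSET card per face, with at most 8 element ids per line
    for face, elements in sorted(faces.items()):
        out.write('*ELSET, ELSET=' + surface_name + '_' + face + '\n')
        for i in range(0, len(elements), 8):
            out.write(', '.join(elements[i:i + 8]) + ',\n')
    # write the *SURFACE card that references the new element sets
    out.write('*SURFACE, NAME = ' + surface_name + ', TYPE = ELEMENT\n')
    for face in sorted(faces):
        out.write(surface_name + '_' + face + ',' + face + '\n')
Usage would be something like write_surface_groups('TOP', surface_lines, n) at the point marked in the answer above.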