Finding text string with pdfminer not consistent [Python]
I've got a question about a piece of code that extracts text strings from a PDF file and writes the output to a .csv file.
The output is stored in Output.csv. As you can see there, values are returned for p. 27 and p. 29, where the code works, but p. 28 is missing. What I want is the text string on p. 28, which the code does not return.
Can somebody tell me what I'm doing wrong? With the second script further down, pdfminer does read out the proper output that is needed.
import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
    print("Trying to install required module: pdfminer\n")
    os.system('python -m pip install pdfminer')
    # -- the lines above try to install the pdfminer module if it is not present
    # -- if all went well, import the required modules again (for global access)
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
# method 3: object oriented programming
class Program:
    # initialisation (runs when Program() is instantiated)
def __init__(self):
# locations
# this defines the location of the workspace and directory of the data to process
self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
# lookuptable
# this converts the lookuptable from maximo to a list which can be used for comparison
self.lu_file = self.ws_loc / "lookuptable.csv"
with open(self.lu_file, newline='') as f:
reader = csv.reader(f)
self.lu_list = list(filter(None,list(reader)))
self.lu_list = [each[0] for each in self.lu_list]
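        # at this point lu_list is a flat list of strings: the first column of lookuptable.csv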
def listener(self,q):
'''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
# open output.csv in location workspace/data/ and use as 'f'
with open(self.ws_loc / 'output.csv', 'a') as f:
#start infinite listening loop until 'kill' message is received
while 1:
# get the message which is first in q (queue)
m = q.get()
# break loop if message is kill and close file 'output.csv'
if m == 'kill':
f.close()
break
# if message is not 'kill' then write message to file and flush file
f.write(m)
f.flush()
def worker(self, file, q):
''' processes a pdf file given by main() and writes output to q (queue)'''
# init PDF class (this class is used to get pages from the PDF and process pdftext)
PDF = self.PDF(self.dat_loc,self.lu_list,0)
# get all the pages from PDF: contains pages = [page1, ..., pageN]
# pageN = "bla bla \n bla etc."
PDFpages = PDF.getPages(file)
pages = []
for page in PDFpages:
pages.append(page)
# varargs defines extra data for files (this is where metadata is stored)
# varargs should not be filled here, but it is initialized here.
varargs = ''
        # check whether the file is a manual (this can be seen as an example of a varargs entry)
        # each entry should contain at least one ',' (this creates a new column entry in the csv)
        # PDF.fileCategory(), a method of the PDF class below, can be taken as an example
varargs+= PDF.fileCategory(file,pages) + ',' + PDF.fileSupplier(file, pages) + ',' + PDF.fileRev(file, pages)
# new vararg can be added like: varargs+= THE_VARARG
# initialise pageNum (which is a page number identifier inside the for loop)
pageNum = 1
        # create an empty datastack (the message that will be sent to q (queue))
datastack = ''
# for each page do...
for page in pages:
'''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
found_strings, found = PDF.find_tag(page)
# found_stringsrev, foundrev = PDF.find_rev(page)
# if tags are found, then fix the tags such that they are correct with
# Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
# if foundrev:
# string = ''
# fixedstring = ''
# for stringrev in found_stringsrev:
# # fill datastack with found tags
# datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + ',' + stringrev + ',' + varargs + '\n'
if found:
for string in found_strings:
                    # keep the string unchanged by default
                    fixedstring = string
                    # if the tag matches the general tag pattern ('regex' or 're')
                    if re.match(r'^(\d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', string) is not None:
                        # normalise it with putStripe (which adds the missing dashes)
                        fixedstring = self.putStripe(string)
# fill datastack with found tags
datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + varargs + '\n'
# next page, so pageNum becomes pageNum + 1
pageNum +=1
# if the datastack is empty, we are still interested in the varargs:
# (so empty tag columns are added)
if datastack=='':
datastack = file + ',' + ',' + ',' + varargs + '\n'
# put the datastack message inside of the q (queue)
q.put(datastack)
# terminate the PDF class so that the pdf file is closed in a correct way
PDF.terminate()
# return (in case the datastack should be printed)
return datastack
def putStripe(self,input):
'''This function fixes a tag that is not correct'''
# strip the tag from spaces
input = re.sub(' ','',input)
# for each string that matches the expression write to words
words = re.findall('[0-9][A-Za-z]+', input)
words += re.findall('[A-Za-z][0-9]+', input)
# for each match inside the tag add a '-' in the second position
for word in words:
i = input.find(word)+1
input = input[:i] + '-' + input[i:]
# return the fixed tag
return input
def main(self):
try:
# initiate time
t = time.time()
            # create a pool of workers for parallel processing (the CPU thread count is obtained automatically)
pool = mp.Pool(mp.cpu_count() + 2)
# create a manager
manager = mp.Manager()
# from the pool manager create a queue object which can be used to
# exchange data between the worker and listener
q = manager.Queue()
# start up listener first
# ignore warning, it is being used
watcher = pool.apply_async(self.listener, (q,))
# fire off workers (basically assign them jobs)
jobs = []
            # NOTE: this loop does not block; apply_async returns immediately,
            # so the jobs run in parallel in the pool
# each file in the data location is a job
for file in os.listdir(self.dat_loc):
# assign the job to a worker
job = pool.apply_async(self.worker, (file, q))
                # append the job to jobs (for data acquisition)
jobs.append(job)
# this is used to get the data back from jobs
for job in tqdm(jobs):
#print('')
#print(job.get()[:-1])
job.get()
            # print the elapsed time (good for project management)
print('elapsed time = ' + str(time.time()-t) + ' seconds')
        # catch the interrupt and try to properly terminate the workers (might take time)
        # best to just do everything in batches and don't interrupt
except KeyboardInterrupt:
print("\nCaught KeyboardInterrupt, terminating workers")
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
pool.terminate()
            raise SystemExit(1)
        # always execute (kills workers and listener)
finally:
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
def execute(self):
self.main()
class PDF:
# from PDF.
def __init__(self,dat_loc,lu_list,maxpages):
self.dat_loc = dat_loc
self.lu_list = lu_list
self.lu_list_f = 0
self.password = ""
self.maxpages = maxpages
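        # note: worker() constructs this class with maxpages=0, which pdfminer's PDFPage.get_pages treats as 'no page limit'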
self.caching = True
self.rsrcmgr = PDFResourceManager()
self.retstr = StringIO()
self.laparams = LAParams()
self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.pagenos=set()
# from PDF.
def getPages(self,file):
self.fp = open(self.dat_loc / file, 'rb')
pages = PDFPage.get_pages(self.fp,
self.pagenos,
maxpages=self.maxpages,
password=self.password,
caching=self.caching,
check_extractable=True)
return pages
# from PDF.
def fileCategory(self,file,pages):
rules = []
rules.append(['Manual',['ANLAGE - INSTALLATION','User Guide','MANUAL','Manual','manual','Handleiding','handleiding','Instruction','instructions','Instructie', 'Guide', 'GUIDE']])
rules.append(['Specification',['SPECIFICATION','Specification','Specificatie']])
rules.append(['Datasheet',['DATA BOOK','UTILITIES LIST','DATA PACKAGE','Data Package','data-sheet','Datasheet','DATASHEET','datasheet','DATA SHEET','Data Sheet','Data sheet','data sheet']])
rules.append(['Spare part list',['SPARE PARTS LIST']])
rules.append(['Invoice',['BILL OF MATERIAL','invoice','Invoice','INVOICE','Purchase order','Purchase Order','PURCHASE ORDER']])
rules.append(['Schematic Diagram',['SCHEMATIC DIAGRAM','Schematic Diagram','Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']])
rules.append(['Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']])
rules.append(['Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']])
rules.append(['Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']])
fileCategory = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileCategory += ',' + category
else:
fileCategory += ',' + 'Unreadable'
return fileCategory
# from PDF.
def fileSupplier(self,file,pages):
rules = []
rules.append(['JE Jacobs',['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']])
rules.append(['Emerson',['Emerson', 'Emerson Process Management', 'EMERSON',]])
rules.append(['Air Liquide',['Air Liquide', 'AIR LIQUIDE']])
rules.append(['Rosemount',['ROSEMOUNT', 'Rosemount']])
rules.append(['Deltak',['Deltak', 'DELTAK']])
rules.append(['AviComp',['AVICOMP', 'Avicomp', 'avicomp']])
fileSupplier = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileSupplier += ',' + category
else:
fileSupplier += ',' + 'Supplier N/A'
return fileSupplier
# from PDF.
def fileRev(self,file,pages):
fileRev = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for line in self.pagestr(page).splitlines():
if re.match('^(Rev.*).*$', line):
found = True
if found:
break
if found:
break
counter+=1
if found:
fileRev += ',' + line
else:
fileRev += ',' + ''
return fileRev
# from PDF.
def find_string_lookup(self,page,pageNum,file,varargs):
datastack = []
data = []
found = False
for line in self.pagestr(page).splitlines():
line = re.sub('[^A-Za-z0-9]+', '', line)
counter = 0
for tag in self.lu_list_f:
if line.find(tag)!=-1:
found = True
data = file + ',' + str(self.lu_list[counter][0]) + ',' + str(pageNum) + varargs +'\n'
if data not in datastack:
datastack += [data]
counter+=1
return datastack, found
# from PDF.
def find_string(self,page,strings,Method=None):
datastack = []
data = []
found = False
if Method=='ALPHABET_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line) for line in strings]
elif Method=='ALPHABETCAPS_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line).upper() for line in strings]
elif Method=='ALPHABETCAPS':
tags = [line.upper() for line in strings]
else:
tags = strings
for line in self.pagestr(page).splitlines():
if Method=='ALPHABET_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line)
elif Method=='ALPHABETCAPS_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line).upper()
elif Method=='ALPHABETCAPS':
line = line.upper()
i = 0
for tag in tags:
if tag != '':
if line.find(tag)!=-1:
found = True
data = strings[i]
if data not in datastack:
datastack += [data]
i+=1
return datastack, found
# from PDF.
def find_tag(self,page):
datastack = []
found = False
for line in self.pagestr(page).splitlines():
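            # note: this pattern only matches a tag at the very start of a line, with at
            # most one space or dash between the digit and letter groups (e.g. '12-AB-1234-A')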
            tags = re.findall(r'^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', line)
for tag in tags:
if tag not in datastack:
datastack += [tag]
found = True
return datastack, found
# from PDF.
# def find_rev(self,page):
# datastack = []
# found = False
# for line in self.pagestr(page).splitlines():
# tags = re.findall('^(Rev.*).*$', line)
# for tag in tags:
# if tag not in datastack:
# datastack += [tag]
# found = True
# return datastack, found
# from PDF.
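    # note on pagestr(): it reuses the single StringIO buffer created in __init__;
    # truncate(0) + seek(0) clears it so each call returns only the current page's text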
def pagestr(self,page):
self.retstr.truncate(0)
self.retstr.seek(0)
self.interpreter.process_page(page)
return self.retstr.getvalue()
# from PDF.
def terminate(self):
self.fp.close()
self.device.close()
self.retstr.close()
# start the code (the proper way)
if __name__ == '__main__':
Program().execute()
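For reference, the tag-fixing step described in the comments above (turning 12AB1234A into 12-AB-1234-A) can be checked on its own. Below is a minimal sketch that copies the logic out of Program.putStripe into a standalone function, since constructing Program directly would also require the workspace files:

import re

def put_stripe(tag):
    # same steps as Program.putStripe: remove spaces, then insert a '-'
    # after every digit/letter boundary found by the two findall patterns
    tag = re.sub(' ', '', tag)
    words = re.findall('[0-9][A-Za-z]+', tag)
    words += re.findall('[A-Za-z][0-9]+', tag)
    for word in words:
        i = tag.find(word) + 1
        tag = tag[:i] + '-' + tag[i:]
    return tag

print(put_stripe('12AB1234A'))  # prints 12-AB-1234-A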
If I read out the PDF with the following code in Python (also with pdfminer):
import re, csv, os  # needed below for csv.reader, re.sub and os.system
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
    print("Trying to install required module: pdfminer\n")
    os.system('python -m pip install pdfminer')
    # -- the lines above try to install the pdfminer module if it is not present
    # -- if all went well, import the required modules again (for global access)
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
class glb():
workspace_folder = Path('C:/Users/pco/Desktop/workspace')
data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
lookup_file = workspace_folder / "lookuptable.csv"
with open(lookup_file, newline='') as f:
reader = csv.reader(f)
lookup_list = list(reader)
lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9]+', '', str(line)) for line in lookup_list]))
def find_tagnumbers(path):
pagelines = []
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
page_no = 1
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
page_str = retstr.getvalue()
pagelines.append(page_str.splitlines())
retstr.truncate(0)
retstr.seek(0)
page_no +=1
page_no +=-1
print(pagelines)
fp.close()
device.close()
retstr.close()
return 1
find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')
This does return 47-AT -0053 (the string shows up when I print pagelines). But when I run the first script, that value never ends up in the output file.
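One way to narrow this down is to test the find_tag pattern on its own against that exact string; a minimal sketch (reusing the regex from find_tag above, with the tag text hard-coded purely for illustration):

import re

# the same pattern used in PDF.find_tag, tried against the tag text
# that the second script extracts from p. 28
pattern = r'^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$'
print(re.findall(pattern, '47-AT -0053'))  # prints []

Here findall returns an empty list: '47-AT -0053' has both a space and a dash between the letter and digit groups, while [ -]{,1} allows at most one separator, and the ^ anchor additionally requires the tag to sit at the very start of the extracted line. That may be where the p. 28 value gets lost.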
P.S. My coding skills are at beginner level (so I write out all the steps).