Why does my Python script with sleep in an infinite loop stop running?

I'm working on a Python script that transfers data from an .xlsx file to an HTML file: I read/parse the Excel file with pandas and use BeautifulSoup to edit the HTML (the paths to these files are read from .txt files). This, on its own, works. However, the script has to run constantly, so everything is called in an infinite while loop that runs every 15 minutes, printing status messages to the console on each pass.
My problem is the following: after a random number of iterations, the code simply stops running, by which I mean no more text on the console and no more changes to the HTML file. When this happens, I have to restart the script to get it working again.
Here is the main function:
def mainFunction():
    if getattr(sys, 'frozen', False):
        application_path = os.path.dirname(sys.executable)
    elif __file__:
        application_path = os.path.dirname(__file__)
    excelFiles = open(str(application_path) + "\\pathsToExcels.txt")
    htmlFiles = open(str(application_path) + "\\pathsToHTMLs.txt")
    sheetFiles = open(str(application_path) + "\\sheetNames.txt")
    print("Reading file paths ...")
    linesEx = excelFiles.readlines()
    linesHtml = htmlFiles.readlines()
    linesSheet = sheetFiles.readlines()
    print("Beginning transfer")
    for i in range(len(linesEx)):
        excel = linesEx[i].strip()
        html = linesHtml[i].strip()
        sheet = linesSheet[i].strip()
        print("Transferring data for " + sheet)
        updater = UpdateHtml(excel, sheet, str(application_path) + "\\pageTemplate.html", html)
        updater.refreshTable()
        updater.addData()
        updater.saveHtml()
    print("Transfer done")
    excelFiles.close()
    htmlFiles.close()
    sheetFiles.close()
UpdateHtml is the one actually responsible for the data transfer.
The "__main__" which also contains the while loop:
if __name__ == "__main__":
    while True:
        print("Update at " + str(datetime.now()))
        mainFunction()
        print("Next update in 15 minutes\n")
        time.sleep(900)
And finally, the batch script that launches it:
python "C:\Users\Me\PythonScripts\excelToHtmlTransfer.py"
pause
From what I've noticed through trials, this doesn't happen when the sleep is set to under 5 minutes (it still happens at exactly 5 minutes) or when the sleep is omitted altogether.
Does anyone have any clue why this might be happening, or any alternatives to sleep in this context?
EDIT: UpdateHtml:
import pandas as pd
from bs4 import BeautifulSoup

class UpdateHtml:
    def __init__(self, pathToExcel, sheetName, pathToHtml, pathToFinalHtml):
        with open(pathToHtml, "r") as htmlFile:
            self.soup = BeautifulSoup(htmlFile.read(), features="html.parser")
        self.df = pd.read_excel(pathToExcel, sheet_name=sheetName)
        self.html = pathToFinalHtml
        self.sheet = sheetName

    def refreshTable(self):
        # deletes the inner html of all table cells
        for i in range(0, 9):
            td = self.soup.find(id='ok' + str(i))
            td.string = ''
            td = self.soup.find(id='acc' + str(i))
            td.string = ''
            td = self.soup.find(id='nok' + str(i))
            td.string = ''
            td = self.soup.find(id='problem' + str(i))
            td.string = ''

    def prepareData(self):
        # changes the names of columns according to their data
        counter = 0
        column_names = {}
        for column in self.df.columns:
            if 'OK' == str(self.df[column].values[6]):
                column_names[self.df.columns[counter]] = 'ok'
            elif 'Acumulate' == str(self.df[column].values[6]):
                column_names[self.df.columns[counter]] = 'acc'
            elif 'NOK' == str(self.df[column].values[6]):
                column_names[self.df.columns[counter]] = 'nok'
            elif 'Problem Description' == str(self.df[column].values[7]):
                column_names[self.df.columns[counter]] = 'prob'
            counter += 1
        self.df.rename(columns=column_names, inplace=True)

    def saveHtml(self):
        with open(self.html, "w") as htmlFile:
            htmlFile.write(self.soup.prettify())

    def addData(self):
        groupCounter = 0
        index = 0
        self.prepareData()
        for i in range(8, 40):
            # Check if we have a valid value in the ok column
            if pd.notna(self.df['ok'].values[i]) and str(self.df['ok'].values[i]) != "0":
                td = self.soup.find(id='ok' + str(index))
                td.string = str(self.df['ok'].values[i])
            # Check if we have a valid value in the accumulate column
            if pd.notna(self.df['acc'].values[i]) and str(self.df['acc'].values[i]) != "0":
                td = self.soup.find(id='acc' + str(index))
                td.string = str(self.df['acc'].values[i])
            # Check if we have a valid value in the nok column
            if pd.notna(self.df['nok'].values[i]) and str(self.df['nok'].values[i]) != "0":
                td = self.soup.find(id='nok' + str(index))
                td.string = str(self.df['nok'].values[i])
            # Check if we have a valid value in the problem column
            if pd.notna(self.df['prob'].values[i]):
                td = self.soup.find(id='problem' + str(index))
                td.string = str(self.df['prob'].values[i])
            if groupCounter == 3:
                index += 1
                groupCounter = 0
            else:
                groupCounter += 1
The Excel file I'm working with is a bit strange, hence all the (seemingly) redundant operations. Still, it has to remain in its current form.
The main quirk is that each 'row' of data is actually spread across 4 regular rows, hence the need for groupCounter: in addData, spreadsheet rows 8-11 all write into the cells with index 0, rows 12-15 into index 1, and so on.

I found a workaround for this problem. Basically, I moved the loop into the batch script, like so:
:whileLoop
python "C:\Users\Me\PythonScripts\excelToHtmlTransfer.py"
timeout /t 900 /nobreak
goto :whileLoop
After leaving it to run for a few hours, the problem didn't occur anymore; unfortunately, I still don't know what caused it.
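For completeness, here is a Python-side alternative to the single long sleep, based only on the observation above that shorter sleeps never triggered the hang: wait the 15 minutes in short slices instead of one 900-second call. This is a hedged sketch, not a confirmed fix for the underlying cause.
import time
from datetime import datetime

WAIT_SECONDS = 900   # 15 minutes
SLICE_SECONDS = 60   # sleep in 60-second slices instead of one long call

if __name__ == "__main__":
    while True:
        print("Update at " + str(datetime.now()))
        mainFunction()
        print("Next update in 15 minutes\n")
        # same total wait, split into short sleeps (unverified whether this
        # avoids whatever makes the single 900-second sleep hang)
        for _ in range(WAIT_SECONDS // SLICE_SECONDS):
            time.sleep(SLICE_SECONDS)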

Related

Could not scrape some values from JSON file in Python

I would like to scrape the data from a JSON file, but I could not scrape the availability ("available" in the JSON) value; the other values are scraped successfully. The column comes out blank. The line in question is:
varavailability= "" if i >= len(variants) else variants[i].get('available', '')
import asyncio
import os
import random
import time
import openpyxl
import aiohttp
from urllib import request

# path="C:/Users/pengoul/Downloads/dl"
path = os.getcwd()
print(f"CWD is {path}")
path = os.path.join(path, "download")
if not os.path.exists(path):
    os.makedirs(path)
# picpath= os.makedirs('picture')

async def request():
    async with aiohttp.ClientSession() as session:
        async with session.get(url='https://hiutdenim.co.uk/products.json?limit=500') as resp:
            html = await resp.json()
            k = list()
            f = openpyxl.Workbook()
            sheet = f.active
            sheet.append(['Name', 'Barcode', 'Product Category', 'Image', 'Internal Reference', 'Sales Price','Product Tags'])
            products = []
            print("Saving to excel ...")
            for i in html['products']:
                title = i.get('title')
                id1 = i.get('id')
                product_type = i.get('product_type')
                images = [img.get('src', '') for img in i.get('images', [])]
                products.append((title, id1, product_type, images))
                variants = [var for var in i.get('variants')]
                for i in range(max(len(images), len(variants))):
                    imgsrc = "" if i >= len(images) else images[i]
                    varsku = "" if i >= len(variants) else variants[i].get('sku', '')
                    varprice = "" if i >= len(variants) else variants[i].get('price', '')
                    varavailability = "" if i >= len(variants) else variants[i].get('available', '')
                    sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])
            f.save(f"result230102.xlsx")
            print("Downloading images ...")
            for product in products:
                title, id1, product_type, images = product
                for seq, imgurl in enumerate(images):
                    print(f"Downloading img for {id1} ({seq + 1}/{len(images)})")
                    request.urlretrieve(imgurl, os.path.join(path, f"{id1}-{seq + 1}.jpg"))

async def download(url):
    image = url[0]
    file_name = f'{url[1]}.jpg'
    print(f'picpath/{file_name}')
    async with aiohttp.ClientSession() as session:
        time.sleep(random.random())
        async with session.get(image) as resp:
            with open(path + file_name, mode='wb') as f:
                f.write(await resp.content.read())
                # print(f'picpath/{file_name}')

async def main():
    if not os.path.exists(path):
        os.mkdir(path)
    tasks = []
    await request()
    # for url in urls:
    #     tasks.append(asyncio.create_task(download(url)))
    # await asyncio.wait(tasks)

if __name__ == '__main__':
    print(os.getpid())
    t1 = time.time()
    urls = []
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    t2 = time.time()
    print('total:', t2 - t1)
The column comes out blank. I would like to scrape the values of "available" from the JSON.
I ran your code in my debugger, putting a breakpoint at the line in question. This breakpoint is hit many times during execution. In some cases, it produces a True value for varavailability as you're expecting.
At some point, this line ends up executing when the value of i is 1 and the length of variants is also 1. In this case, per the if condition if i >= len(variants), the variable varavailability is set to "". i is allowed to have a value of 1 because the length of images in this case is 5. In this case, your loop for i in range(max(len(images), len(variants))): will iterate over i == 0 to i == 4. For each i value greater than 0, varavailability will be set to "". I can't be sure if this is the case you're wondering about, but it makes good sense that it is.
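To make that concrete, here is a minimal standalone illustration; the list contents are invented, and only the lengths (5 images, 1 variant) match the debugging session described above:
# Illustration only: invented data, lengths taken from the debugging session (5 images, 1 variant)
images = ['img0.jpg', 'img1.jpg', 'img2.jpg', 'img3.jpg', 'img4.jpg']
variants = [{'sku': 'SKU-0', 'price': '95.00', 'available': True}]
for i in range(max(len(images), len(variants))):
    varavailability = "" if i >= len(variants) else variants[i].get('available', '')
    print(i, repr(varavailability))
# prints True for i == 0, then '' for i == 1 through 4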
UPDATE:
As to how to fix this, the question centers on how the contents of variants and images relate to each other and on what you are doing in your loop:
for i in range(max(len(images), len(variants))):
    imgsrc = "" if i >= len(images) else images[i]
    varsku = "" if i >= len(variants) else variants[i].get('sku', '')
    varprice = "" if i >= len(variants) else variants[i].get('price', '')
    varavailability = "" if i >= len(variants) else variants[i].get('available', '')
    sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])
It seems that the code is iterating over a list of products, and each product has two lists associated with it: a list of images and a list of variants. My guess is that the contents of these two lists are independent, i.e. that each value in images does not correspond to a particular entry in variants.
If what you want is a table of product variants, one possible solution is to associate all of the images for a particular product with each of the variations of that product, and then just iterate over each of the variants. That could be something like this:
imgsrc = " ".join(images)
for variant in variants:
varsku = variants.get('sku', '')
varprice = variants.get('price', '')
varavailability = variants.get('available', '')
sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])

Finding text string with pdfminer not consistent [Python]

I've got a question about code that gets text strings from a PDF file and returns the output in a .csv.
The output is stored in Output.csv. As you can see, it returns values for p. 27 and p. 29 (there the code works), but p. 28 is missing. What I want returned is the text string on p. 28, where the code is not working.
Can somebody tell me what I'm doing wrong? In the second piece of code, pdfminer does read out the proper output that is needed.
import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
# method 3: object oriented programming
class Program:
#initialisation (happens when Program() is called for the first time)
def __init__(self):
# locations
# this defines the location of the workspace and directory of the data to process
self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
# lookuptable
# this converts the lookuptable from maximo to a list which can be used for comparison
self.lu_file = self.ws_loc / "lookuptable.csv"
with open(self.lu_file, newline='') as f:
reader = csv.reader(f)
self.lu_list = list(filter(None,list(reader)))
self.lu_list = [each[0] for each in self.lu_list]
def listener(self,q):
'''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
# open output.csv in location workspace/data/ and use as 'f'
with open(self.ws_loc / 'output.csv', 'a') as f:
#start infinite listening loop until 'kill' message is received
while 1:
# get the message which is first in q (queue)
m = q.get()
# break loop if message is kill and close file 'output.csv'
if m == 'kill':
f.close()
break
# if message is not 'kill' then write message to file and flush file
f.write(m)
f.flush()
def worker(self, file, q):
''' processes a pdf file given by main() and writes output to q (queue)'''
# init PDF class (this class is used to get pages from the PDF and process pdftext)
PDF = self.PDF(self.dat_loc,self.lu_list,0)
# get all the pages from PDF: contains pages = [page1, ..., pageN]
# pageN = "bla bla \n bla etc."
PDFpages = PDF.getPages(file)
pages = []
for page in PDFpages:
pages.append(page)
# varargs defines extra data for files (this is where metadata is stored)
# varargs should not be filled here, but it is initialized here.
varargs = ''
# check if file is a manual (this can be seen as an example for a varargs entry)
# it should contain atleast ',' (this creates a new column entry in the csv)
# PDF.fileCategory() which is a class within the Program class, can be taken as an example
varargs+= PDF.fileCategory(file,pages) + ',' + PDF.fileSupplier(file, pages) + ',' + PDF.fileRev(file, pages)
# new vararg can be added like: varargs+= THE_VARARG
# initialise pageNum (which is a page number identifier inside the for loop)
pageNum = 1
# create an empty datastack (which is the message that will be send to q (queue))
datastack = ''
# for each page do...
for page in pages:
'''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
found_strings, found = PDF.find_tag(page)
# found_stringsrev, foundrev = PDF.find_rev(page)
# if tags are found, then fix the tags such that they are correct with
# Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
# if foundrev:
# string = ''
# fixedstring = ''
# for stringrev in found_stringsrev:
# # fill datastack with found tags
# datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + ',' + stringrev + ',' + varargs + '\n'
if found:
for string in found_strings:
# if correct, do not change
fixedstring = string
# check if the tag matches the correct regexpression ('regex' or 're')
if re.match('^(\d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', string)!=None:
# else fix the tag
fixedstring = self.putStripe(string)
# fill datastack with found tags
datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + varargs + '\n'
# next page, so pageNum becomes pageNum + 1
pageNum +=1
# if the datastack is empty, we are still interested in the varargs:
# (so empty tag columns are added)
if datastack=='':
datastack = file + ',' + ',' + ',' + varargs + '\n'
# put the datastack message inside of the q (queue)
q.put(datastack)
# terminate the PDF class so that the pdf file is closed in a correct way
PDF.terminate()
# return (in case the datastack should be printed)
return datastack
def putStripe(self,input):
'''This function fixes a tag that is not correct'''
# strip the tag from spaces
input = re.sub(' ','',input)
# for each string that matches the expression write to words
words = re.findall('[0-9][A-Za-z]+', input)
words += re.findall('[A-Za-z][0-9]+', input)
# for each match inside the tag add a '-' in the second position
for word in words:
i = input.find(word)+1
input = input[:i] + '-' + input[i:]
# return the fixed tag
return input
def main(self):
try:
# initiate time
t = time.time()
# create pools for paralell pooling (max cpu threads is optained automatically)
pool = mp.Pool(mp.cpu_count() + 2)
# create a manager
manager = mp.Manager()
# from the pool manager create a queue object which can be used to
# exchange data between the worker and listener
q = manager.Queue()
# start up listener first
# ignore warning, it is being used
watcher = pool.apply_async(self.listener, (q,))
# fire off workers (basically assign them jobs)
jobs = []
# NOTE: FOR LOOPS DO NOT CAUSE A LOOP, CODE PROCEEDS WITH PARALLEL THREADING
# AS IF THE RESULT OF EACH LOOP IS INSTANTLY COMPLETED
# each file in the data location is a job
for file in os.listdir(self.dat_loc):
# assign the job to a worker
job = pool.apply_async(self.worker, (file, q))
# append the job to jobs (for data aquisition)
jobs.append(job)
# this is used to get the data back from jobs
for job in tqdm(jobs):
#print('')
#print(job.get()[:-1])
job.get()
# printed elapsed time (good for project management)
print('elapsed time = ' + str(time.time()-t) + ' seconds')
# catch interupt and try to properly terminate workers (might take time)
# best to just do everything in batches and dont interrupt
except KeyboardInterrupt:
print("\nCaught KeyboardInterrupt, terminating workers")
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
pool.terminate()
SystemExit(1)
# always excecute (kills workers and listener)
finally:
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
def execute(self):
self.main()
class PDF:
# from PDF.
def __init__(self,dat_loc,lu_list,maxpages):
self.dat_loc = dat_loc
self.lu_list = lu_list
self.lu_list_f = 0
self.password = ""
self.maxpages = maxpages
self.caching = True
self.rsrcmgr = PDFResourceManager()
self.retstr = StringIO()
self.laparams = LAParams()
self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.pagenos=set()
# from PDF.
def getPages(self,file):
self.fp = open(self.dat_loc / file, 'rb')
pages = PDFPage.get_pages(self.fp,
self.pagenos,
maxpages=self.maxpages,
password=self.password,
caching=self.caching,
check_extractable=True)
return pages
# from PDF.
def fileCategory(self,file,pages):
rules = []
rules.append(['Manual',['ANLAGE - INSTALLATION','User Guide','MANUAL','Manual','manual','Handleiding','handleiding','Instruction','instructions','Instructie', 'Guide', 'GUIDE']])
rules.append(['Specification',['SPECIFICATION','Specification','Specificatie']])
rules.append(['Datasheet',['DATA BOOK','UTILITIES LIST','DATA PACKAGE','Data Package','data-sheet','Datasheet','DATASHEET','datasheet','DATA SHEET','Data Sheet','Data sheet','data sheet']])
rules.append(['Spare part list',['SPARE PARTS LIST']])
rules.append(['Invoice',['BILL OF MATERIAL','invoice','Invoice','INVOICE','Purchase order','Purchase Order','PURCHASE ORDER']])
rules.append(['Schematic Diagram',['SCHEMATIC DIAGRAM','Schematic Diagram','Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']])
rules.append(['Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']])
rules.append(['Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']])
rules.append(['Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']])
fileCategory = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileCategory += ',' + category
else:
fileCategory += ',' + 'Unreadable'
return fileCategory
# from PDF.
def fileSupplier(self,file,pages):
rules = []
rules.append(['JE Jacobs',['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']])
rules.append(['Emerson',['Emerson', 'Emerson Process Management', 'EMERSON',]])
rules.append(['Air Liquide',['Air Liquide', 'AIR LIQUIDE']])
rules.append(['Rosemount',['ROSEMOUNT', 'Rosemount']])
rules.append(['Deltak',['Deltak', 'DELTAK']])
rules.append(['AviComp',['AVICOMP', 'Avicomp', 'avicomp']])
fileSupplier = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileSupplier += ',' + category
else:
fileSupplier += ',' + 'Supplier N/A'
return fileSupplier
# from PDF.
def fileRev(self,file,pages):
fileRev = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for line in self.pagestr(page).splitlines():
if re.match('^(Rev.*).*$', line):
found = True
if found:
break
if found:
break
counter+=1
if found:
fileRev += ',' + line
else:
fileRev += ',' + ''
return fileRev
# from PDF.
def find_string_lookup(self,page,pageNum,file,varargs):
datastack = []
data = []
found = False
for line in self.pagestr(page).splitlines():
line = re.sub('[^A-Za-z0-9]+', '', line)
counter = 0
for tag in self.lu_list_f:
if line.find(tag)!=-1:
found = True
data = file + ',' + str(self.lu_list[counter][0]) + ',' + str(pageNum) + varargs +'\n'
if data not in datastack:
datastack += [data]
counter+=1
return datastack, found
# from PDF.
def find_string(self,page,strings,Method=None):
datastack = []
data = []
found = False
if Method=='ALPHABET_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line) for line in strings]
elif Method=='ALPHABETCAPS_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line).upper() for line in strings]
elif Method=='ALPHABETCAPS':
tags = [line.upper() for line in strings]
else:
tags = strings
for line in self.pagestr(page).splitlines():
if Method=='ALPHABET_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line)
elif Method=='ALPHABETCAPS_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line).upper()
elif Method=='ALPHABETCAPS':
line = line.upper()
i = 0
for tag in tags:
if tag != '':
if line.find(tag)!=-1:
found = True
data = strings[i]
if data not in datastack:
datastack += [data]
i+=1
return datastack, found
# from PDF.
def find_tag(self,page):
datastack = []
found = False
for line in self.pagestr(page).splitlines():
tags = re.findall('^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', line)
for tag in tags:
if tag not in datastack:
datastack += [tag]
found = True
return datastack, found
# from PDF.
# def find_rev(self,page):
# datastack = []
# found = False
# for line in self.pagestr(page).splitlines():
# tags = re.findall('^(Rev.*).*$', line)
# for tag in tags:
# if tag not in datastack:
# datastack += [tag]
# found = True
# return datastack, found
# from PDF.
def pagestr(self,page):
self.retstr.truncate(0)
self.retstr.seek(0)
self.interpreter.process_page(page)
return self.retstr.getvalue()
# from PDF.
def terminate(self):
self.fp.close()
self.device.close()
self.retstr.close()
# start the code (the proper way)
if __name__ == '__main__':
Program().execute()
If I read out the PDF with this code in Python (also with pdfminer):
from pathlib import Path
from io import StringIO
try:
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
except ImportError:
    print ("Trying to Install required module: pdfminer\n")
    os.system('python -m pip install pdfminer')
    # -- above lines try to install requests module if not present
    # -- if all went well, import required module again ( for global access)

class glb():
    workspace_folder = Path('C:/Users/pco/Desktop/workspace')
    data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
    lookup_file = workspace_folder / "lookuptable.csv"
    with open(lookup_file, newline='') as f:
        reader = csv.reader(f)
        lookup_list = list(reader)
    lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9]+', '', str(line)) for line in lookup_list]))

def find_tagnumbers(path):
    pagelines = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    page_no = 1
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
        page_str = retstr.getvalue()
        pagelines.append(page_str.splitlines())
        retstr.truncate(0)
        retstr.seek(0)
        page_no += 1
    page_no += -1
    print(pagelines)
    fp.close()
    device.close()
    retstr.close()
    return 1

find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')
it does return 47-AT -0053. But if I run the first code, it doesn't return the value in the output file (this is the output when I print pagelines).
P.S. My coding skills are at beginner level (so I write out all the steps).

Pandas and CSV libraries: CSV manipulation

I am building a simple app.
I want some values in my CSV to be updated every 15 minutes, and I want this part of my app to run in the background so it doesn't block the interface.
I couldn't get it to work the way I want.
My code:
# I'm using the pandas, sched and time imports here:
#INTERFACE
@app.route("/")
def home():
    s = sched.scheduler(time.time, time.sleep)
    s.enter(15, 1, timer)
    s.run()
    return render_template("index.html")

#TIMER
def timer():
    timer = 0
    csv_file = 'C:\Python27\Walmart\sheet.csv'
    print("UPDATING")
    data_df = pd.read_csv(csv_file)
    print("READ")
    for i, row in data_df.iterrows():
        sku = data_df.iloc[i]['Walmart SKU']
        print(sku)
        if sku is '':
            break
        else:
            update(sku)
            print("Item Updated")
    print("UPDATECOMPLETE")
    home()

#UPDATE
def update(sku):
    lookup = str(sku)
    lookup = lookup.replace('.0', '')
    product = wapy.product_lookup(lookup)
    ts = time.time()
    st = datetime.datetime.today().strftime('%Y-%m-%d %I:%M %p')
    print(product.name)
    if product.available_online is 'TRUE':
        instock = 'yes'
    else:
        instock = 'no'
    quote_page = product.product_url
    page = urlopen(quote_page)
    soup = BeautifulSoup(page, 'html.parser')
    sold_box = soup.find('a', attrs={'class': 'font-bold prod-SoldShipByMsg'})
    sold = sold_box.text.strip()
    left_box = soup.find('div', attrs={'class': 'prod-ProductOffer-urgencyMsg'})
    left = left_box.text.strip()
    if left is '':
        stock = product.stock
    else:
        stock = left
    # fields=[lookup + ',' + '$'+str(product.sale_price) + ',' + instock + ',' + stock + ',' + str(sold) + ',' + st + ',' + '$'+str(product.msrp)]
    pathto_csv = 'C:\Python27\Walmart\sheet.csv'
    data_df = pd.read_csv(pathto_csv)
    print("CSV READ")
    data_df.set_value([lookup], ['Price'], '$'+str(product.sale_price))
    data_df.set_value([lookup], ['In Stock'], instock)
    data_df.set_value([lookup], ['Quantity'], stock)
    data_df.set_value([lookup], ['Last Update'], str(sold))
    data_df.set_value([lookup], ['Min Price'], '$'+str(product.msrp))
    data_df.to_csv(pathto_csv)
    with open(r'sheet.csv', 'a') as f:
        writer = csv.writer(f, delimiter=' ', quotechar=' ')
        writer.writerow(fields)
    print(st)
    print("UPDATED! 15 Minutes Have Passed!")
I have two problems:
1. When replacing the value in my update I'm getting this error:
KeyError: "['879091509'] not in index"
I thought that [row], [column] would let me replace the value I want for that cell, for example:
data_df.set_value([lookup], ['Price'], '$'+str(product.sale_price))
I read this as:
on the row where my SKU or ID is represented by [lookup],
replace ['Price'] with '$'+str(product.sale_price).
2. I can't get to my interface.
I think this is because, when the timer resets, it runs the code again.
Question: How can I make this run only in the background?
Q1
There are two issues:
The first argument to set_value is looked up in the DataFrame's index, but your DataFrame isn't indexed by SKU; it still has the default integer index, which is why '879091509' is "not in index".
You're passing lists to set_value, but it just needs plain scalar values.
To fix, first set an index:
data_df = data_df.set_index('Walmart SKU')
And remove the lists from set_value:
data_df.set_value(lookup, 'Price', '$'+str(product.sale_price))
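Putting both fixes together, here is a minimal sketch of the corrected update; the SKU value is hypothetical, and note that set_value was later deprecated and removed in pandas 1.0, where .at does the same job:
import pandas as pd

data_df = pd.read_csv('sheet.csv')
data_df = data_df.set_index('Walmart SKU')            # index by SKU so lookups work

lookup = 879091509                                     # hypothetical SKU; must match the index dtype
data_df.set_value(lookup, 'Price', '$' + str(19.99))   # plain scalars, no lists
# equivalent in newer pandas: data_df.at[lookup, 'Price'] = '$' + str(19.99)

data_df.to_csv('sheet.csv')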
Q2
This isn't clear enough to answer. Try asking a new question with a minimal test case.

List index out of range error when breaking a while loop in Python

Hi, I am new to Python and struggling my way through. Currently I am doing a task that appends Excel files, and here's my sample code. I'm getting a list index out of range error because, as far as I can tell, the while loop is not breaking at the end of each Excel file. Any help would be appreciated. Thanks:
import xlrd
import glob
import os
import openpyxl
import csv
from xlrd import open_workbook
from os import listdir
row = {}
basedir = '../files/'
files = listdir('../files')
sheets = [filename for filename in files if filename.endswith("xlsx")]
header_is_written = False
for filename in sheets:
    print('Parsing {0}{1}\r'.format(basedir,filename))
    worksheet = open_workbook(basedir+filename).sheet_by_index(0)
    print (worksheet.cell_value(5,6))
    counter = 0
    while True:
        row['plan name'] = worksheet.cell_value(1+counter,1).strip()
        row_values = worksheet.row_slice(counter+1,start_colx=0, end_colx=30)
        row['Dealer'] = int(row_values[0].value)
        row['Name'] = str(row_values[1].value)
        row['City'] = str(row_values[2].value)
        row['State'] = str(row_values[3].value)
        row['Zip Code'] = int(row_values[4].value)
        row['Region'] = str(row_values[5].value)
        row['AOM'] = str(row_values[6].value)
        row['FTS Short Name'] = str(row_values[7].value)
        row['Overall Score'] = float(row_values[8].value)
        row['Overall Rank'] = int(row_values[9].value)
        row['Count of Ros'] = int(row_values[10].value)
        row['Count of PTSS Cases'] = int(row_values[11].value)
        row['% of PTSS cases'] = float(row_values[12].value)
        row['Rank of Cases'] = int(row_values[13].value)
        row['% of Not Prepared'] = float(row_values[14].value)
        row['Rank of Not Prepared'] = int(row_values[15].value)
        row['FFVt Pre Qrt'] = float(row_values[16].value)
        row['Rank of FFVt'] = int(row_values[17].value)
        row['CSI Pre Qrt'] = int(row_values[18].value)
        row['Rank of CSI'] = int(row_values[19].value)
        row['FFVC Pre Qrt'] = float(row_values[20].value)
        row['Rank of FFVc'] = int(row_values[21].value)
        row['OnSite'] = str(row_values[22].value)
        row['% of Onsite'] = str(row_values[23].value)
        row['Not Prepared'] = int(row_values[24].value)
        row['Open'] = str(row_values[25].value)
        row['Cost per Vin Pre Qrt'] = float(row_values[26].value)
        row['Damages per Visit Pre Qrt'] = float(row_values[27].value)
        row['Claim Sub time pre Qrt'] = str(row_values[28].value)
        row['Warranty Index Pre Qrt'] = str(row_values[29].value)
        counter += 1
        if row['plan name'] is None:
            break
        with open('table.csv', 'a', newline='') as f:
            w = csv.DictWriter(f, row.keys())
            if header_is_written is False:
                w.writeheader()
                header_is_written = True
            w.writerow(row)
In place of while True, use a for loop.
row['plan name'] = worksheet.cell_value(1 + counter, 1).strip()
row_values = worksheet.row_slice(counter + 1, start_colx=0, end_colx=30)
for values in row_values:
    row['Dealer'] = int(values.value)
    row['Name'] = str(values.value)
    ....
because while True means the loop runs an infinite number of times (or until it reaches a break keyword inside the while loop).
Read more about while loops.
A while True loop basically means: execute the following code block to infinity, unless a break or sys.exit statement gets you out.
So in your case, you need to terminate after the rows to append from the Excel file are exhausted. You have two options: check whether there are more rows to append, and if not, break.
A more suitable approach when working through a file like this is a for loop; this kind of loop terminates by itself when it is exhausted (see the sketch below).
Also, you should consider gathering the content of the Excel file in one operation and saving it to a variable. Then, once you have it, iterate over it and append it to the CSV.
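As an illustration of that approach, here is a hedged sketch against the original snippet; it assumes xlrd stays in use, that data rows start at index 1, and it abbreviates the column assignments:
for filename in sheets:
    worksheet = open_workbook(basedir + filename).sheet_by_index(0)
    # nrows bounds the loop, so it stops at the end of each sheet instead of
    # reading past it and raising an index error
    for row_idx in range(1, worksheet.nrows):
        row_values = worksheet.row_slice(row_idx, start_colx=0, end_colx=30)
        row['plan name'] = str(row_values[1].value).strip()
        row['Dealer'] = int(row_values[0].value)
        # ... fill in the remaining columns exactly as in the original code ...
        with open('table.csv', 'a', newline='') as f:
            w = csv.DictWriter(f, row.keys())
            if not header_is_written:
                w.writeheader()
                header_is_written = True
            w.writerow(row)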

Extract data from web page

I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code:
import os
import csv
import time
import urllib2
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'
def get_data():
    try:
        req = urllib2.Request(uri)
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False

def extract(data,rk):
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        if data.find('nba/player/',start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/',start)
            name_start = data.find('>',main) + 1
            name_end = data.find('<',name_start)
            store.append(data[name_start:name_end])
            team_start = data.find('">',name_end) + 2
            team_end = data.find('<',team_start)
            store.append(data[team_start:team_end])
            gp_start = data.find(' >',team_end) + 2
            gp_end = data.find('<',gp_start)
            store.append(data[gp_start:gp_end])
            mpg_start = data.find(' >',gp_end) + 2
            mpg_end = data.find('<',mpg_start)
            store.append(data[mpg_start:mpg_end])
            pts_start = data.find('">',mpg_end) + 2
            pts_end = data.find('<',pts_start)
            store.append(data[pts_start:pts_end])
            mf_start = data.find(' >',pts_end) + 2
            mf_end = data.find('<',mf_start)
            store.append(data[mf_start:mf_end])
            fg_start = data.find(' >',mf_end) + 2
            fg_end = data.find('<',fg_start)
            store.append(data[fg_start:fg_end])
            m3_start = data.find(' >',fg_end) + 2
            m3_end = data.find('<',m3_start)
            store.append(data[m3_start:m3_end])
            p3_start = data.find(' >',m3_end) + 2
            p3_end = data.find('<',p3_start)
            store.append(data[p3_start:p3_end])
            ft_start = data.find(' >',p3_end) + 2
            ft_end = data.find('<',ft_start)
            store.append(data[ft_start:ft_end])
            ftp_start = data.find(' >',ft_end) + 2
            ftp_end = data.find('<',ftp_start)
            store.append(data[ftp_start:ftp_end])
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
        fcsv.close()

def main():
    print "\n[+] Initializing..."
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"])
        fcsv.close()
    rk = 1
    global uri
    while True:
        time.sleep(1)
        start = 0
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data,rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab info based on the headlines, like TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA and FT%, so that the script doesn't need to be changed beyond things like pts or mpg in pts_start = data.find('">',mpg_end) + 2.
I don't understand why I can't just put in the name of the headline as it is shown in the table for certain ones. For example, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example is just to give you the idea, not a complete solution to your problem, but you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch rest of the indexs for corresponding headers
               }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run it and see the result ;).
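If you want the keys to come straight from the table's headlines (TEAM, GP, MPG, PTS and so on), one possible extension is to read them from the header row instead of hard-coding them. This sketch assumes the header cells are the <td>s of the rows marked with the colhead class, which is what the code above skips:
def extract_rows_by_header(dom):
    table_rows = dom.select('.mod-content tbody tr')
    headers = []
    for tr in table_rows:
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            # remember the most recent header row instead of skipping it
            headers = [td.get_text(strip=True) for td in tr.select('td')]
            continue
        tds = tr.select('td')
        if headers and len(tds) == len(headers):
            yield dict(zip(headers, (td.get_text(strip=True) for td in tds)))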
