Shoving Scrapy Objects into array when parsing; what am I doing wrong? - python

I've basically created a spider that follows a set of links acquired from an API, and then extracts text from the HTML body. I'm trying to append returned items to appropriate lists, which are then added to a dictionary. When I run the code, the resultant JSON file only successfully writes the first line.
I am running Python 3.6 in a virtual environment on a Windows 10 64-bit machine, and I run pip-upgrade daily.
from nltk.corpus import stopwords
import smtplib
from time import sleep  # To prevent overwhelming the server between connections
from bs4 import BeautifulSoup as soup
import scrapy
import mysql.connector as mariadb
import sys
from collections import Counter
from pprint import pprint
import json
import re

conn = mariadb.connect(user=dbuser, password=dbpassword, database=dbdatabase)
c = conn.cursor()
e = sys.exc_info()[0]

c.execute("Select URL FROM [TABLE]")
JobURLs = c.fetchall()
for object in JobURLs:
    urls = []
    url_string = str(object)
    rx = re.compile('\W\W\W$')
    res = rx.sub('', url_string)
    rx = re.compile('^\W\W')
    url = rx.sub('', res)
    urls.append(url)

c.execute("Select JvId FROM [TABLE]")
JobIDs = c.fetchall()
for object in JobIDs:
    item = {}
    item['JvId'] = []
    JobID_string = str(object)
    rx = re.compile('\W\W\W$')
    res = rx.sub('', JobID_string)
    rx = re.compile('^\W\W')
    JobID = rx.sub('', res)
    item['JvId'].append(JobID)

class JobListing(scrapy.Spider):
    name = 'JobListingCrawler'
    start_urls = urls

    def parse(self, response):
        # pass
        item['urlText'] = response.url
        page_html = response.body
        page_soup = soup(page_html, 'lxml')
        for script in page_soup(['script', 'style']):
            script.extract()
        item['jobDescText'] = page_soup.get_text('''\n''', strip=True)

        ## TextCleaner Function for Word Counter
        text = item['jobDescText'].replace('\n', ' ')
        lines = [line.strip() for line in text.splitlines()]
        chunks = [phrase.strip() for line in lines for phrase in line.split(' ')]

        def chunk_space(chunk):
            chunk_out = chunk + ' '
            return chunk_out

        text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8')
        try:
            text = text.decode('unicode_escape').encode('ascii', 'ignore')
        except:
            print(e)
            pass
        text = re.sub('[^a-zA-Z,+3]', ' ', str(text))
        text = text.lower().split()
        stop_words = set(stopwords.words('english'))
        text = [word for word in text if not word in stop_words]
        wordCounter = Counter(text)
        item['wordCounter'] = str(wordCounter)

        ## And now we parse for email addresses!
        prog = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")  # '@' rather than '#', and explicit A-Za-z ranges, so the pattern actually matches email addresses
        found = prog.search(item['jobDescText'].replace('\n', ' '))
        try:
            item['email'] = str(found.group(0))
        except:
            item['email'] = 'null'
            pass

        filename = 'results.jl'
        line = json.dumps(dict(item)) + '\n'
        with open(filename, 'a') as f:
            f.write(line)
        self.log('Saved Line to %s' % filename)

You just need to declare a Scrapy Item, which contains the definitions of your returned fields.
After that, you just need to configure your settings file to enable Scrapy Feed Exports, which use the built-in JsonItemExporter for your extracted data:
FEED_URI = 'file:///tmp/export.json'
FEED_FORMAT = 'json'
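A minimal sketch of such an item declaration, using the same field names as the dictionary in the question (the class name and field set here are illustrative, not part of the original answer):
# items.py -- hypothetical item mirroring the dict keys used in the spider
import scrapy

class JobListingItem(scrapy.Item):
    JvId = scrapy.Field()
    urlText = scrapy.Field()
    jobDescText = scrapy.Field()
    wordCounter = scrapy.Field()
    email = scrapy.Field()
The spider's parse() method would then yield the populated item instead of appending to results.jl by hand, and the feed exporter takes care of serializing every yielded item to the configured JSON file.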

So silly me: I put the list variable inside the for loop, so each time the loop ran it re-created the list and deleted the previously written values. Moving the declarations outside of the loop solved the problem.
c.execute("Select URL FROM CareerOneStopJobs")
JobURLs = c.fetchall()
urls = []
for element in JobURLs:
url_string = str(element)
rx = re.compile('\W\W\W$')
res = rx.sub('', url_string)
rx = re.compile('^\W\W')
url = rx.sub('', res)
urls.append(url)
c.execute("Select JvId FROM CareerOneStopJobs")
JobIDs = c.fetchall()
item = {}
for JobID in JobIDs:
item['JvId'] = []
JobID_string = str(JobID)
rx = re.compile('\W\W\W$')
res = rx.sub('', JobID_string)
rx = re.compile('^\W\W')
JobID = rx.sub('', res)
item['JvId'] = JobID
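To make the failure mode concrete, here is a minimal, self-contained illustration of the same pattern (hypothetical values, not the original data):
rows = ['a', 'b', 'c']

# Buggy pattern: the list is re-created on every iteration,
# so after the loop it only holds the element from the final pass.
for row in rows:
    results = []
    results.append(row)
print(results)   # ['c']

# Fixed pattern: declare the list once, before the loop.
results = []
for row in rows:
    results.append(row)
print(results)   # ['a', 'b', 'c']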

Related

Printing a list item above a tqdm progress bar fails to replace the previous item and instead adds a new line

I am attempting to read URLs (bare domains such as "url.com") from a text file, append them to a list, and then iterate over the list to check whether each URL is an active website.
I am having problems with tqdm: I don't know how to make the current list item replace the previous one while iterating over my group of URLs.
Can someone guide me on how to properly replace the previously printed item without adding a newline (with tqdm)?
import urllib
from tqdm.auto import tqdm, trange
import time
from urllib.request import urlopen
import sys
with open(r"millard.txt", 'r') as fp:
total_lines = len(fp.readlines())
print('Total Number of lines:', total_lines)
score_list = []
working = []
not_working = []
cntr = 0
num_lines = sum(1 for line in open('millard.txt', 'r'))
danky = "Progress"
with open('millard.txt', 'r') as f:
for i in tqdm(f, total=num_lines, position=0, leave=False, desc=danky):
# print(line)
# url = i.strip() # to remove the trailing \n
new_link = "http://www.{poop}".format(poop=i)
print("Getting %s" % new_link, end="")
# try block to read URL
try:
post = urllib.request.urlopen(new_link)
# print(post.__dict__)
working.append(new_link)
except urllib.error.HTTPError as e:
# print(e.__dict__)
fss = 1
except urllib.error.URLError as e:
# print(e.__dict__)
fss = 1
cntr += 1
time.sleep(0.01)
with open(r'final.txt', 'w') as fp:
for item in working:
# write each item on a new line
fp.write("%s" % item)
print('Done')
I have tried a lot; I just need the script to not print out a list of URLs, but instead print each URL on the same line as the group/list is being iterated.
from urllib.request import urlopen
from socket import error as SocketError
import errno
from tqdm import tqdm  # needed for tqdm() below; imported in the first snippet but missing here

with open(r"millard.txt", 'r') as fp:
    total_lines = len(fp.readlines())
    print('Total urls loaded:', total_lines)

working = []
not_working = []
num_lines = sum(1 for line in open('millard.txt', 'r'))

urls = []
f = open('millard.txt', 'r+')
f1 = f.readlines()
for i in f1:
    urls.append(i.rstrip())

for i in (pbar := tqdm(urls, total=num_lines, position=0, leave=True, desc=".")):
    # print(line)
    # url = i.strip() # to remove the trailing \n
    new_link = "http://www.{poop}".format(poop=i)
    # try block to read URL
    import urllib.request
    hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    try:
        req = urllib.request.Request(new_link, headers=hdr)
        response = urllib.request.urlopen(req)
        response.read()
        code = " - success"
        working.append(new_link)
    except urllib.error.HTTPError as e:
        # status = e.__dict__
        code = " - failed"
    except urllib.error.URLError as e:
        # status = e.__dict__
        code = " - failed"
    except SocketError as e:
        # if e.errno != errno.ECONNRESET:
        code = " - failed"
    pbar.set_description(f"Checking: {i}{code}")

with open(r'final.txt', 'w') as fp:
    for item in working:
        # write each item on a new line
        fp.write("%s \n" % item)

with open(r"final.txt", 'r') as fp:
    total_lines = len(fp.readlines())
    print('Total urls loaded:', total_lines)
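The part that keeps everything on one line is pbar.set_description(), which redraws the text in front of the same progress bar on every iteration instead of printing a new line. A stripped-down sketch of just that mechanism (hypothetical URL list, no network calls):
import time
from tqdm import tqdm

urls = ['example.com', 'example.org', 'example.net']  # hypothetical data

for url in (pbar := tqdm(urls, leave=True)):
    # set_description() rewrites the prefix of the existing bar line,
    # so each URL replaces the previous one instead of stacking up.
    pbar.set_description(f"Checking: {url}")
    time.sleep(0.5)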

Finding a text string with pdfminer is not consistent [Python]

I've got a question about code that gets a text string from a PDF file and returns the output in a .csv file.
The output is stored in Output.csv. As you can see, it returns the value on p. 27 (where the code works) and on p. 29, but p. 28 is missing. What I want is to return the text string on p. 28, where the code is not working.
Can somebody tell me what I'm doing wrong? In the second code snippet, pdfminer does read out the proper output that is needed.
import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
# method 3: object oriented programming
class Program:
#initialisation (happens when Program() is called for the first time)
def __init__(self):
# locations
# this defines the location of the workspace and directory of the data to process
self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
# lookuptable
# this converts the lookuptable from maximo to a list which can be used for comparison
self.lu_file = self.ws_loc / "lookuptable.csv"
with open(self.lu_file, newline='') as f:
reader = csv.reader(f)
self.lu_list = list(filter(None,list(reader)))
self.lu_list = [each[0] for each in self.lu_list]
def listener(self,q):
'''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
# open output.csv in location workspace/data/ and use as 'f'
with open(self.ws_loc / 'output.csv', 'a') as f:
#start infinite listening loop until 'kill' message is received
while 1:
# get the message which is first in q (queue)
m = q.get()
# break loop if message is kill and close file 'output.csv'
if m == 'kill':
f.close()
break
# if message is not 'kill' then write message to file and flush file
f.write(m)
f.flush()
def worker(self, file, q):
''' processes a pdf file given by main() and writes output to q (queue)'''
# init PDF class (this class is used to get pages from the PDF and process pdftext)
PDF = self.PDF(self.dat_loc,self.lu_list,0)
# get all the pages from PDF: contains pages = [page1, ..., pageN]
# pageN = "bla bla \n bla etc."
PDFpages = PDF.getPages(file)
pages = []
for page in PDFpages:
pages.append(page)
# varargs defines extra data for files (this is where metadata is stored)
# varargs should not be filled here, but it is initialized here.
varargs = ''
# check if file is a manual (this can be seen as an example for a varargs entry)
# it should contain atleast ',' (this creates a new column entry in the csv)
# PDF.fileCategory() which is a class within the Program class, can be taken as an example
varargs+= PDF.fileCategory(file,pages) + ',' + PDF.fileSupplier(file, pages) + ',' + PDF.fileRev(file, pages)
# new vararg can be added like: varargs+= THE_VARARG
# initialise pageNum (which is a page number identifier inside the for loop)
pageNum = 1
# create an empty datastack (which is the message that will be send to q (queue))
datastack = ''
# for each page do...
for page in pages:
'''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
found_strings, found = PDF.find_tag(page)
# found_stringsrev, foundrev = PDF.find_rev(page)
# if tags are found, then fix the tags such that they are correct with
# Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
# if foundrev:
# string = ''
# fixedstring = ''
# for stringrev in found_stringsrev:
# # fill datastack with found tags
# datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + ',' + stringrev + ',' + varargs + '\n'
if found:
for string in found_strings:
# if correct, do not change
fixedstring = string
# check if the tag matches the correct regexpression ('regex' or 're')
if re.match('^(\d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', string)!=None:
# else fix the tag
fixedstring = self.putStripe(string)
# fill datastack with found tags
datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + varargs + '\n'
# next page, so pageNum becomes pageNum + 1
pageNum +=1
# if the datastack is empty, we are still interested in the varargs:
# (so empty tag columns are added)
if datastack=='':
datastack = file + ',' + ',' + ',' + varargs + '\n'
# put the datastack message inside of the q (queue)
q.put(datastack)
# terminate the PDF class so that the pdf file is closed in a correct way
PDF.terminate()
# return (in case the datastack should be printed)
return datastack
def putStripe(self,input):
'''This function fixes a tag that is not correct'''
# strip the tag from spaces
input = re.sub(' ','',input)
# for each string that matches the expression write to words
words = re.findall('[0-9][A-Za-z]+', input)
words += re.findall('[A-Za-z][0-9]+', input)
# for each match inside the tag add a '-' in the second position
for word in words:
i = input.find(word)+1
input = input[:i] + '-' + input[i:]
# return the fixed tag
return input
def main(self):
try:
# initiate time
t = time.time()
# create pools for paralell pooling (max cpu threads is optained automatically)
pool = mp.Pool(mp.cpu_count() + 2)
# create a manager
manager = mp.Manager()
# from the pool manager create a queue object which can be used to
# exchange data between the worker and listener
q = manager.Queue()
# start up listener first
# ignore warning, it is being used
watcher = pool.apply_async(self.listener, (q,))
# fire off workers (basically assign them jobs)
jobs = []
# NOTE: FOR LOOPS DO NOT CAUSE A LOOP, CODE PROCEEDS WITH PARALLEL THREADING
# AS IF THE RESULT OF EACH LOOP IS INSTANTLY COMPLETED
# each file in the data location is a job
for file in os.listdir(self.dat_loc):
# assign the job to a worker
job = pool.apply_async(self.worker, (file, q))
# append the job to jobs (for data aquisition)
jobs.append(job)
# this is used to get the data back from jobs
for job in tqdm(jobs):
#print('')
#print(job.get()[:-1])
job.get()
# printed elapsed time (good for project management)
print('elapsed time = ' + str(time.time()-t) + ' seconds')
# catch interupt and try to properly terminate workers (might take time)
# best to just do everything in batches and dont interrupt
except KeyboardInterrupt:
print("\nCaught KeyboardInterrupt, terminating workers")
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
pool.terminate()
SystemExit(1)
# always excecute (kills workers and listener)
finally:
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
def execute(self):
self.main()
class PDF:
# from PDF.
def __init__(self,dat_loc,lu_list,maxpages):
self.dat_loc = dat_loc
self.lu_list = lu_list
self.lu_list_f = 0
self.password = ""
self.maxpages = maxpages
self.caching = True
self.rsrcmgr = PDFResourceManager()
self.retstr = StringIO()
self.laparams = LAParams()
self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.pagenos=set()
# from PDF.
def getPages(self,file):
self.fp = open(self.dat_loc / file, 'rb')
pages = PDFPage.get_pages(self.fp,
self.pagenos,
maxpages=self.maxpages,
password=self.password,
caching=self.caching,
check_extractable=True)
return pages
# from PDF.
def fileCategory(self,file,pages):
rules = []
rules.append(['Manual',['ANLAGE - INSTALLATION','User Guide','MANUAL','Manual','manual','Handleiding','handleiding','Instruction','instructions','Instructie', 'Guide', 'GUIDE']])
rules.append(['Specification',['SPECIFICATION','Specification','Specificatie']])
rules.append(['Datasheet',['DATA BOOK','UTILITIES LIST','DATA PACKAGE','Data Package','data-sheet','Datasheet','DATASHEET','datasheet','DATA SHEET','Data Sheet','Data sheet','data sheet']])
rules.append(['Spare part list',['SPARE PARTS LIST']])
rules.append(['Invoice',['BILL OF MATERIAL','invoice','Invoice','INVOICE','Purchase order','Purchase Order','PURCHASE ORDER']])
rules.append(['Schematic Diagram',['SCHEMATIC DIAGRAM','Schematic Diagram','Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']])
rules.append(['Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']])
rules.append(['Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']])
rules.append(['Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']])
fileCategory = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileCategory += ',' + category
else:
fileCategory += ',' + 'Unreadable'
return fileCategory
# from PDF.
def fileSupplier(self,file,pages):
rules = []
rules.append(['JE Jacobs',['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']])
rules.append(['Emerson',['Emerson', 'Emerson Process Management', 'EMERSON',]])
rules.append(['Air Liquide',['Air Liquide', 'AIR LIQUIDE']])
rules.append(['Rosemount',['ROSEMOUNT', 'Rosemount']])
rules.append(['Deltak',['Deltak', 'DELTAK']])
rules.append(['AviComp',['AVICOMP', 'Avicomp', 'avicomp']])
fileSupplier = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileSupplier += ',' + category
else:
fileSupplier += ',' + 'Supplier N/A'
return fileSupplier
# from PDF.
def fileRev(self,file,pages):
fileRev = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for line in self.pagestr(page).splitlines():
if re.match('^(Rev.*).*$', line):
found = True
if found:
break
if found:
break
counter+=1
if found:
fileRev += ',' + line
else:
fileRev += ',' + ''
return fileRev
# from PDF.
def find_string_lookup(self,page,pageNum,file,varargs):
datastack = []
data = []
found = False
for line in self.pagestr(page).splitlines():
line = re.sub('[^A-Za-z0-9]+', '', line)
counter = 0
for tag in self.lu_list_f:
if line.find(tag)!=-1:
found = True
data = file + ',' + str(self.lu_list[counter][0]) + ',' + str(pageNum) + varargs +'\n'
if data not in datastack:
datastack += [data]
counter+=1
return datastack, found
# from PDF.
def find_string(self,page,strings,Method=None):
datastack = []
data = []
found = False
if Method=='ALPHABET_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line) for line in strings]
elif Method=='ALPHABETCAPS_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line).upper() for line in strings]
elif Method=='ALPHABETCAPS':
tags = [line.upper() for line in strings]
else:
tags = strings
for line in self.pagestr(page).splitlines():
if Method=='ALPHABET_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line)
elif Method=='ALPHABETCAPS_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line).upper()
elif Method=='ALPHABETCAPS':
line = line.upper()
i = 0
for tag in tags:
if tag != '':
if line.find(tag)!=-1:
found = True
data = strings[i]
if data not in datastack:
datastack += [data]
i+=1
return datastack, found
# from PDF.
def find_tag(self,page):
datastack = []
found = False
for line in self.pagestr(page).splitlines():
tags = re.findall('^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', line)
for tag in tags:
if tag not in datastack:
datastack += [tag]
found = True
return datastack, found
# from PDF.
# def find_rev(self,page):
# datastack = []
# found = False
# for line in self.pagestr(page).splitlines():
# tags = re.findall('^(Rev.*).*$', line)
# for tag in tags:
# if tag not in datastack:
# datastack += [tag]
# found = True
# return datastack, found
# from PDF.
def pagestr(self,page):
self.retstr.truncate(0)
self.retstr.seek(0)
self.interpreter.process_page(page)
return self.retstr.getvalue()
# from PDF.
def terminate(self):
self.fp.close()
self.device.close()
self.retstr.close()
# start the code (the proper way)
if __name__ == '__main__':
Program().execute()
If i read out the pdf with this code in python (also with pdfminer):
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
class glb():
workspace_folder = Path('C:/Users/pco/Desktop/workspace')
data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
lookup_file = workspace_folder / "lookuptable.csv"
with open(lookup_file, newline='') as f:
reader = csv.reader(f)
lookup_list = list(reader)
lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9]+', '', str(line)) for line in lookup_list]))
def find_tagnumbers(path):
pagelines = []
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
page_no = 1
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
page_str = retstr.getvalue()
pagelines.append(page_str.splitlines())
retstr.truncate(0)
retstr.seek(0)
page_no +=1
page_no +=-1
print(pagelines)
fp.close()
device.close()
retstr.close()
return 1
find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')
it does return 47-AT -0053 (this is the output when I print pagelines). But if I run the other code, it doesn't return the value in the output file.
P.S. My coding skills are at a beginner level (so I write out all the steps).
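Not part of the original post, but when one extraction path finds a string like 47-AT -0053 and the other does not, it can help to test the tag pattern from find_tag() against that exact line in isolation, for example:
import re

# Hypothetical check: the pattern copied from find_tag(), applied to the
# string that the second snippet prints but the first snippet misses.
pattern = r'^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$'
line = '47-AT -0053'
print(re.findall(pattern, line))  # an empty list means the regex itself rejects this line
If the list comes back empty, the difference between the two runs lies in the regular expression rather than in the PDF extraction.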

How to get the URL from local files?

I used Wget to download webpages.
I want to ask: is it possible to get the URL from local HTML files?
I used Python to analyze the HTML file content, and I want to print every file's URL.
I am trying to add more functions to this program, so I think that if I can print the URL next to each result, the user can easily click the link to get to the webpage.
Here is my code:
def search(self):
keyword = self.entry.get()
mypath = "/Users/Tsu-AngChou/MasterProject/Practice/try_test/"
files = listdir(mypath)
translator = str.maketrans("","",string.punctuation)
count1 = 0
test_list = []
test_list2 = []
for f in files:
fullpath = join(mypath, f)
if f == '.DS_Store':
os.remove(f)
elif isfile(fullpath):
# print(f)
for html_cont in range(1):
response = open(f,'r',encoding='utf-8')
html_cont = response.read()
soup = bs(html_cont, 'html.parser')
regular_string = soup.get_text()
new_string = regular_string.translate(translator).split()
new_list = [item[:14] for item in new_string]
a = dict.fromkeys(new_list, f)
wordfreq = []
c = new_list
for w in c:
wordfreq.append(c.count(w))
fre = dict(zip(c,wordfreq))
sentence= new_list
keyword1= keyword
words = sentence
if keyword in fre:
test_list.append(a[keyword])
test_list2.append(fre[keyword])
count1 = count1+1
for (i, subword) in enumerate(words):
if (subword == keyword1):
test_list3= i+1
for i in range(0,count1-1):
for j in range(0,count1-1-i):
if (test_list2[j]<test_list2[j+1]):
temp=test_list[j]
temp2=test_list2[j]
test_list[j]=test_list[j+1]
test_list2[j]=test_list2[j+1]
test_list[j+1]=temp
test_list2[j+1]=temp2
for i in range(0,count1):
print(keyword, "Filename:", test_list[i], "Frequency:", test_list2[i])
return a
This is my output, and I want to have the link appear alongside every result.
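One common approach (not from this thread, so treat it as a sketch) is to read the saved page's own metadata: many pages embed a <link rel="canonical"> or <meta property="og:url"> tag, which BeautifulSoup can pull out of the local file when it is present:
from bs4 import BeautifulSoup

def original_url(path):
    # Best-effort guess at the page's source URL from tags commonly
    # embedded in the HTML itself; returns None if neither tag is present.
    with open(path, 'r', encoding='utf-8') as fh:
        soup = BeautifulSoup(fh.read(), 'html.parser')
    canonical = soup.find('link', rel='canonical')
    if canonical and canonical.get('href'):
        return canonical['href']
    og = soup.find('meta', property='og:url')
    if og and og.get('content'):
        return og['content']
    return None

print(original_url('somepage.html'))  # hypothetical filename
Pages that carry neither tag would need another source for the URL, such as a mapping written down at download time.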

How can I parse a number from JSON into Python with Intrinio?

I am having trouble parsing the market cap (it is a number) from an API call. I get the data, but for the purposes of the program I am developing I just need the number and not the other stuff.
import csv
import intrinio
import requests
import json
import re
api_username = "b9cb2b8cbda8dde39f27a21f66e12afd"
api_password = "6d71a6dd01dd554f92a03f0e1b40dd44"
# CSV_URL = 'https://api.intrinio.com/financials/reported.csv?identifier=AAPL&statement=income_statement&fiscal_year=2015&fiscal_period=FY'
# CSV_URL2 = 'https://api.intrinio.com/financials/standardized.csv?identifier=AAPL&statement=balance_sheet&type=FY&fiscal_period=FY&date=2017-05-20'
CSV_URL3 = 'https://api.intrinio.com/data_point?identifier=AAPL&item=marketcap'
with requests.Session() as s:
download = s.get(CSV_URL3,auth=(api_username, api_password))
decoded_content = download.content.decode('utf-8')
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
my_list = list(cr)
fx = open(r'test3.csv', 'w')
for item in my_list:
fx.write("%s\n" % item)
str1 = my_list[0][2]
num = re.findall('\d+', str1)
final_number = float(num[0])
print(final_number)
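Since the data_point endpoint returns JSON rather than CSV, a simpler route is to let requests decode the body and read the numeric field directly. A sketch, assuming the response is a JSON object whose market cap sits under a "value" key (verify against the actual response body):
import requests

url = 'https://api.intrinio.com/data_point?identifier=AAPL&item=marketcap'

with requests.Session() as s:
    resp = s.get(url, auth=(api_username, api_password))
    data = resp.json()                  # parse the JSON body instead of treating it as CSV
    market_cap = float(data['value'])   # assumes the number lives under the 'value' key
    print(market_cap)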

Extract data from web page

I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines of this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table, but the resulting CSV file had missing information.
Full code :
import os
import csv
import time
import urllib2
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'
def get_data():
try:
req = urllib2.Request(uri)
response = urllib2.urlopen(req, timeout=600)
content = response.read()
return content
except Exception, e:
print "\n[!] Error: " + str(e)
print ''
return False
def extract(data,rk):
print '\n[+] Extracting data.'
start = 0
while True:
store = [rk]
if data.find('nba/player/',start) == -1:
break
with open("data.csv", "ab") as fcsv:
main = data.find('nba/player/',start)
name_start = data.find('>',main) + 1
name_end = data.find('<',name_start)
store.append(data[name_start:name_end])
team_start = data.find('">',name_end) + 2
team_end = data.find('<',team_start)
store.append(data[team_start:team_end])
gp_start = data.find(' >',team_end) + 2
gp_end = data.find('<',gp_start)
store.append(data[gp_start:gp_end])
mpg_start = data.find(' >',gp_end) + 2
mpg_end = data.find('<',mpg_start)
store.append(data[mpg_start:mpg_end])
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
m3_start = data.find(' >',fg_end) + 2
m3_end = data.find('<',m3_start)
store.append(data[m3_start:m3_end])
p3_start = data.find(' >',m3_end) + 2
p3_end = data.find('<',p3_start)
store.append(data[p3_start:p3_end])
ft_start = data.find(' >',p3_end) + 2
ft_end = data.find('<',ft_start)
store.append(data[ft_start:ft_end])
ftp_start = data.find(' >',ft_end) + 2
ftp_end = data.find('<',ftp_start)
store.append(data[ftp_start:ftp_end])
start = name_end
rk = rk + 1
csv.writer(fcsv).writerow(store)
fcsv.close()
def main():
print "\n[+] Initializing..."
if not os.path.exists("data.csv"):
with open("data.csv", "ab") as fcsv:
csv.writer(fcsv).writerow(["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"])
fcsv.close()
rk = 1
global uri
while True:
time.sleep(1)
start = 0
print "\n[+] Getting data, please wait."
data = get_data()
if not data:
break
extract(data,rk)
print "\n[+] Preparing for next page."
time.sleep(1.5)
rk = rk + 40
if rk > 300:
print "\n[+] All Done !\n"
break
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)
if __name__ == '__main__':
main()
I specifically want to know how to grab info based on the headlines, like TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA, FT%, so that the script doesn't need to be changed beyond swapping names like pts or mpg in pts_start = data.find('">',mpg_end) + 2.
I don't understand why I can't just input the name of the headline as it is shown in the table for certain ones; for example, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example gives you the idea but is not a complete solution to your problem; however, you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch the rest of the indexes for the corresponding headers
               }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run it and see the result ;).
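If you want each row keyed by whatever headings the page uses (so FTM-FTA stays FTM-FTA instead of a hand-picked abbreviation like ft), one way, sketched here against the same .mod-content / colhead selectors as above and untested against the live page, is to read the header row once and zip it with each data row's cells:
def extract_rows_by_header(dom):
    # Grab the column headings from the first header row, then pair each
    # data row's cells with those headings instead of hard-coding names.
    header_row = dom.select_one('.mod-content tr.colhead')
    headers = [td.get_text(strip=True) for td in header_row.select('td')]
    for tr in dom.select('.mod-content tbody tr'):
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        values = [td.get_text(strip=True) for td in tr.select('td')]
        yield dict(zip(headers, values))
Each yielded dict then carries exactly the column names that appear at the top of the table.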
