I have written my first Python script to loop through some URLs listed in a CSV — over 14,000 links. I am trying to 1) get all the keyword tags, 2) check page status (404 links need to get flagged), and 3) convert YouTube videos into the embed YouTube link (after maybe going to the webpage, getting the keywords, and then converting into the embed link).
It is going very slowly, but I cannot figure out a faster way. I feel like it is the requests.get() call, but I don't know how I can speed it up. I only need the metadata, but is there a way to only get the beginning of the page and not all of it? How do I make this code better / faster / more optimized?
Also, when compiling with PyInstaller I receive a collections problem. I feel like I am using Python 2 code in Python 3, as I am writing in Python 3.5.
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
source_code = ''
myURL = ''
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

with open(newfilename, "w", newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(('cmsid', 'filepath', 'metatags', 'pageurl', 'pageurlchange'))
file.close()

with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    for row in csv_f:
        if len(row) != 0:
            # init variables
            myKeywords = ""
            myTitle = ''
            myURL = ''
            pageUrlChange = ''
            pageStatus = ''
            pageUrl = ''
            myCmsid = (row[0])
            myURL = (row[2])
            if "https://www.youtube.com/embed/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[4]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
                myURL = 'https://www.youtube.com/watch?v=' + youtubeurl
            try:
                source_code = requests.get(myURL)
            except Exception:
                with open('errors.txt', 'a', newline='') as file:
                    writer = csv.writer(file, delimiter=',')
                    writer.writerow((myCmsid, myURL))
                file.close()
            pageStatus = source_code.status_code
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, 'html.parser')
            pageStatus = str(pageStatus)
            pageStatus = pageStatus[:1]
            pageStatus = int(pageStatus)
            if pageStatus == 2:
                pageUrlChange = 0
            else:
                pageUrlChange = 1
            if pageStatus == 3:
                pageUrl = source_code.url
            l = soup.findAll("meta", attrs={"name": "keywords"})
            if l is None:
                myKeywords = ""
            else:
                try:
                    myKeywords = l[0]['content']
                except:
                    myKeywords = myKeywords
            myKeywords = myKeywords.replace(', ', '~')
            myKeywords = myKeywords.replace(',', '~')
            myKeywords = myKeywords.replace('(', '')
            myKeywords = myKeywords.replace(')', '')
            if soup.find('title'):
                myTitle = soup.find('title').string
            if "https://www.youtube.com/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
                # print(youtubeurl)
            if "https://youtu.be/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
                # print(youtubeurl)
            # print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            with open(newfilename, "a", newline='') as file:
                writer = csv.writer(file, delimiter=',')
                writer.writerow((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            file.close()
f.close()
html.parser is a pure-Python implementation built on regular expressions. You really don't want to use it. Install lxml and have the parsing done in C code instead (and do remember to then use BeautifulSoup(plain_text, 'lxml')).
You also don't want to keep re-opening your CSV file. Open it once, outside your loop, and simply write new rows to the csv.writer() object in your loop.
You can't otherwise speed up URL loading much; network speed is always going to be a bottleneck. You could use the very low-level PycURL library, but I doubt the speedups it can offer would have much impact here.
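For illustration, here is a minimal sketch combining those two suggestions (open the output file once, parse with lxml). The file names and the simplified column handling are placeholders, not your exact original logic:

import csv
import requests
from bs4 import BeautifulSoup

in_path = 'content_links.csv'                    # placeholder input path
out_path = 'content_links_and_keywords.csv'      # placeholder output path

with open(in_path, 'r', encoding='utf-8-sig') as f, \
     open(out_path, 'w', newline='') as out:     # output opened once, outside the loop
    reader = csv.reader(f, delimiter=',')
    next(reader, None)                           # skip the header row
    writer = csv.writer(out, delimiter=',')
    writer.writerow(('cmsid', 'filepath', 'metatags', 'pageurl', 'pageurlchange'))
    for row in reader:
        if not row:
            continue
        response = requests.get(row[2])
        soup = BeautifulSoup(response.text, 'lxml')   # C-backed parser instead of html.parser
        tag = soup.find('meta', attrs={'name': 'keywords'})
        keywords = tag['content'] if tag and tag.has_attr('content') else ''
        writer.writerow((row[0], row[2], keywords, response.url, response.status_code))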
In addition to the excellent suggestion to move to a faster parser, this is a good candidate for parallelization via the multiprocessing module. I've rearranged your code so the request/parsing is done in a worker function that can be delegated to a subprocess. The worker returns the row that needs to be added to the CSV. I added a 0/-1 status code at the front of the returned row so the parent process knows which CSV gets the result.
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import multiprocessing
import traceback


def grabber(args):
    # imap_unordered() passes one item at a time, so unpack the (cmsid, url) tuple here
    myCmsid, myURL = args
    try:
        return grabber_impl(myCmsid, myURL)
    except:
        return (-1, myCmsid, myURL, traceback.format_exc())


def grabber_impl(myCmsid, myURL):
    # init variables (myURL is the function argument, so it is not reset here)
    myKeywords = ""
    myTitle = ''
    pageUrlChange = ''
    pageStatus = ''
    pageUrl = ''

    if "https://www.youtube.com/embed/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[4]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
        myURL = 'https://www.youtube.com/watch?v=' + youtubeurl

    source_code = requests.get(myURL)
    pageStatus = source_code.status_code
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')  # consider 'lxml' here, per the other answer
    pageStatus = str(pageStatus)
    pageStatus = pageStatus[:1]
    pageStatus = int(pageStatus)

    if pageStatus == 2:
        pageUrlChange = 0
    else:
        pageUrlChange = 1
    if pageStatus == 3:
        pageUrl = source_code.url

    l = soup.findAll("meta", attrs={"name": "keywords"})
    if l is None:
        myKeywords = ""
    else:
        try:
            myKeywords = l[0]['content']
        except:
            myKeywords = myKeywords
    myKeywords = myKeywords.replace(', ', '~')
    myKeywords = myKeywords.replace(',', '~')
    myKeywords = myKeywords.replace('(', '')
    myKeywords = myKeywords.replace(')', '')

    if soup.find('title'):
        myTitle = soup.find('title').string

    if "https://www.youtube.com/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
        # print(youtubeurl)

    if "https://youtu.be/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
        # print(youtubeurl)

    # print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
    return (0, myCmsid, myURL, myKeywords, pageUrl, pageUrlChange)


linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    pool = multiprocessing.Pool()
    with open(newfilename, 'a', newline='') as out, open('errors.txt', 'a', newline='') as err:
        writer = csv.writer(out, delimiter=',')
        err_writer = csv.writer(err, delimiter=',')
        for result in pool.imap_unordered(grabber, ((row[0], row[2]) for row in csv_f if row), chunksize=1):
            if result[0] == 0:
                writer.writerow(result[1:])
            else:
                print(result[3])
                err_writer.writerow(result[1:3])
    pool.close()
    pool.join()
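One extra note, since the paths in the question point at Windows and PyInstaller was mentioned: multiprocessing on Windows re-imports the main module in every worker, so the pool-driving section above has to live under an if __name__ == '__main__': guard, and a frozen executable generally also wants multiprocessing.freeze_support(). A minimal sketch of how the driver could be wrapped (main() is just a hypothetical name for the CSV-reading/pool code above):

import multiprocessing


def main():
    # the CSV-reading / pool.imap_unordered() driver code shown above goes here
    pass


if __name__ == '__main__':
    multiprocessing.freeze_support()  # no-op when not frozen; needed for PyInstaller executables on Windows
    main()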
I am trying to extract names from custom <h2> tags, but the names I want are extracted many times. How do I fix this problem so each name is extracted only once? The page I am pulling data from is the one requested in the code below.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest

lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if(page_num > page_limit // 25):
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class":"photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name","phone","website","logo"])
    wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, so a dictionary could be used to hold all of your data. Simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv

lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False
    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)
        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)
                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")
                    a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None
                    div_logo = soup_details.find("div", {"class":"photo-container"})
                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None
                    a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None
                    lawyers[name] = [phone, logo, website]
                    found = True
    # Keep going until no new names are found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
    for name, details in lawyers.items():
        csv_output.writerow([name, *details])
I hope somebody can help me with the next issue.
I would like to get the data in one row; this is what I get now in my CSV:
9200000083649863,bol.com retourdeals
9200000083649863,"41,75"
9200000083649863,ITidee
9200000083649863,"45,88"
9200000083649863,Bol.com
9200000083649863,"47,99"
What I would like:
9200000083649863,bol.com retourdeals,41,75
9200000083649863,ITidee,45,88
9200000083649863,Bol.com,47,99
This is the code:
import csv
import requests
from bs4 import BeautifulSoup


def haalprijs_verkoper(ean, Urll):
    URL = Urll
    ean = ean
    page = requests.get(URL)
    csvfile = open('/home/filoor1/webscrape/book1.csv', 'a')
    csvwriter = csv.writer(csvfile)
    soup = ""
    results = ""
    soup = BeautifulSoup(page.text, 'html.parser')
    results = soup.find(id='offers')
    naam = results.find_all("p, strong")
    prijs = results.find_all("span")
    # print(results.prettify())
    counter = 0
    for tag in results.find_all(['strong', 'span']):
        # print(tag.text)
        aa = tag.text
        aa = aa.replace("Nieuw", "")
        aa = aa.replace(" ", "")
        aa = aa.replace("\n", "")
        aa = aa.replace("''", "aaaaaa")
        aa = aa.strip(' "')
        aa = aa.strip('"')
        if aa != "":
            counter += 0.5
            # print(ean, aa, counter)
            csvwriter.writerow([ean, aa])


haalprijs_verkoper(9200000083649863, 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc')
Thank you
You can use this example to scrape the data and save the correct CSV:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

ean = '9200000083649863'
all_data = []
for s, p in zip(soup.select('p.nosp > strong'),
                soup.select('span.product-prices__currency.product-prices__bol-price')):
    all_data.append([ean, s.get_text(strip=True), p.get_text(strip=True)])

with open('data.csv', 'w') as f_out:
    writer = csv.writer(f_out)
    writer.writerows(all_data)
Saves this data.csv:
9200000083649863,bol.com retourdeals,"41,75"
9200000083649863,ITidee,"45,88"
9200000083649863,Bol.com,"47,99"
9200000083649863,4Allshop,"49,70"
9200000083649863,codima,"51,69"
9200000083649863,PlazaSale.nl,"53,40"
9200000083649863,Stock Sellers B.V.,"53,67"
9200000083649863,Art & Craft,"54,27"
9200000083649863,ORM Wholesale,"54,38"
9200000083649863,DutchDo B.V.,"55,92"
I'm trying to pull in a list of stocks from a CSV file, look up each stock ticker on finviz.com, and export the data to a CSV file. I'm new to Python programming, but I know this will help me and others. This is what I've got so far.
import csv
import urllib.request
from bs4 import BeautifulSoup

with open('shortlist.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    name = None
    for row in reader:
        if row[0]:
            name = row[0]
        print(name)

write_header = True
sauce = print(name)
soup = BeautifulSoup(sauce.text, 'html.parser')
print(soup.title.text)

symbols = name
""""
print(symbols)
"""

URL_BASE = "https://finviz.com/quote.ashx?t="
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for ticker in symbols:
        URL = URL_BASE + ticker
        try:
            fpage = urllib.request.urlopen(URL)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            if write_header:
                # note the change
                writer.writerow(['ticker'] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'}))))
                write_header = False
            # note the change
            writer.writerow([ticker] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'}))))
        except urllib.request.HTTPError:
            print("{} - not found".format(URL))
I'm missing the output in the CSV file "output.csv"; I'm only seeing the data from my input CSV file "shortlist". The tie-in between the two files is not made correctly. I've spent a couple of weeks researching and working on how to do this. Your help is greatly appreciated.
import csv
import urllib.request
from bs4 import BeautifulSoup

with open('shortlist.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    name = None
    for row in reader:
        if row[0]:
            name = row[0]
        print(name)

write_header = True
#sauce = print(name)
#soup = BeautifulSoup(sauce.text, 'html.parser')
#print(soup.title.text)

symbols = name
""""
print(symbols)
"""

URL_BASE = "https://finviz.com/quote.ashx?t="
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for ticker in symbols:
        URL = URL_BASE + ticker
        try:
            fpage = urllib.request.urlopen(URL)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            if write_header:
                # note the change
                writer.writerow(['ticker'] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'}))))
                write_header = False
            # note the change
            writer.writerow([ticker] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'}))))
        except urllib.request.HTTPError:
            print("{} - not found".format(URL))
This is the output:
(screenshot omitted)
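No answer was posted here, but one thing stands out in the code above: symbols = name leaves symbols holding a single string (the last value read from shortlist.csv), so for ticker in symbols iterates over its individual characters rather than over tickers, and sauce = print(name) stores None. Below is a hedged sketch of one possible rework, assuming shortlist.csv has one ticker per row in its first column (the site may also reject requests that lack a browser-like User-Agent header):

import csv
import urllib.request
from bs4 import BeautifulSoup

URL_BASE = "https://finviz.com/quote.ashx?t="

# Collect every ticker first (assuming one ticker per row, first column).
symbols = []
with open('shortlist.csv', 'r') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        if row and row[0].strip():
            symbols.append(row[0].strip())

write_header = True
with open('output.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    for ticker in symbols:
        url = URL_BASE + ticker
        try:
            fpage = urllib.request.urlopen(url)
        except urllib.request.HTTPError:
            print("{} - not found".format(url))
            continue
        fsoup = BeautifulSoup(fpage, 'html.parser')
        if write_header:
            writer.writerow(['ticker'] + [e.text for e in fsoup.find_all('td', {'class': 'snapshot-td2-cp'})])
            write_header = False
        writer.writerow([ticker] + [e.text for e in fsoup.find_all('td', {'class': 'snapshot-td2'})])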
I have a script that loops through multiple pages of a website, and I want to skip over or add a blank entry for an item that might not be on certain pages. For example, some pages do not contain a description of the book. When I run into one of those pages I get an AttributeError. My script below loops through the first two pages with no problem, but when it hits the third page it stops.
Here is the traceback:
    item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/beautifulsoup4-4.6.0-py3.6.egg/bs4/element.py", line 737, in __getattr__
AttributeError: 'NavigableString' object has no attribute 'text'
How can I fix this? Here is my script:
from bs4 import BeautifulSoup as soup
import requests
import json

base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30

for i in range(4, n+1):
    response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
    # html parsing
    page_soup = soup(response.content, "html5lib")
    # grabs info for each textbook
    containers = page_soup.findAll("div", {"class": "LongDescription"})
    author = page_soup.select("p")
    about = page_soup.find("div", {"id": "AboutBook"})
    for container in containers:
        item = {}
        item['type'] = "Textbook"
        item['title'] = container.find("div", {"class": "twothird"}).h1.text
        item['author'] = author[3].get_text(separator=', ')
        if item['author'] == " ":
            item['author'] = "University of Minnesota Libraries Publishing"
        item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
        if not container.find(string="Publisher: "):
            item['publisher_url'] = item['publisher'] = ""
        else:
            item['publisher'] = container.find(text="Publisher: ").nextSibling.text
            item['publisher_url'] = container.find(text="Publisher: ").nextSibling['href']
        item['source'] = "Open Textbook Library"
        if not about.h2.nextSibling.nextSibling.nextSibling:
            item['description'] = ""
        else:
            item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
        item['base_url'] = "https://open.umn.edu/opentextbooks/"
        if container.find("p", {"class": "Badge-Condition"}).a:
            item['license'] = container.find("p", {"class": "Badge-Condition"}).a.text
        if container.find("img", {"class": "ctl00_maincontent_imgLicence"}):
            item['license'] = ''
        if container.find("p", {"class": "Badge-Condition"}).a:
            item['license_url'] = container.find("p", {"class": "Badge-Condition"}).a["href"]
        if container.find("img", {"class": "ctl00_maincontent_imgLicence"}):
            item['license_url'] = ''
        if container.find("div", {"class": "twothird"}).p:
            item['review'] = container.find("div", {"class": "twothird"}).p.text
        else:
            item['review'] = ''
        if item['review'].startswith('('):
            item['review'] = item['review'].replace('(', '')
        if item['review'].endswith(' reviews)'):
            item['review'] = item['review'].replace(' reviews)', '')
        if item['review'] > str(0):
            item['review'] = "Reviewed Resource"
        else:
            item['review'] = ''
        item['image_url'] = "https://open.umn.edu/opentextbooks/" + container.img["src"]
        data.append(item)  # add the item to the list

with open("./json/otl-1.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
I wouldn't recommend parsing the description with item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text; that's far too specific. I came up with this code:
from bs4 import BeautifulSoup as soup
import requests
import json
from pprint import pprint

base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30

for i in range(4, n+1):
    response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
    page_soup = soup(response.content, "lxml")
    data = {}
    title, author, description = page_soup.select('h1')[0].text, \
                                 page_soup.select('h1 ~ p')[3].get_text(', '), \
                                 '\n'.join(p.text.strip() for p in page_soup.select('div#AboutBook > p') if p.text.strip())
    data['type'] = "Textbook"
    data['title'] = title
    data['author'] = author if author.strip() else "University of Minnesota Libraries Publishing"
    data['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
    data['source'] = "Open Textbook Library"
    data['description'] = description
    pprint(data)

    # with open("./json/otl-1.json", "w") as writeJSON:
    #     json.dump(data, writeJSON, ensure_ascii=False)
Prints:
{'author': 'University of Minnesota Libraries Publishing',
'description': 'This book is intended for an undergraduate or MBA level '
'Financial Accounting course. It covers the standard topics in '
'a standard sequence, utilizing the Socratic method of asking '
'and answering questions.',
'link': 'https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=4',
'source': 'Open Textbook Library',
'title': 'Financial Accounting',
'type': 'Textbook'}
...and so on (for each book)
Wherever you are getting the AttributeError, you can wrap that code in a try/except:

try:
    # your code here
except AttributeError:
    pass  # or other handling code
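Applied to the line from the question's traceback, a minimal sketch would look like this (assuming an empty string is an acceptable fallback when a page has no description):

try:
    item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
except AttributeError:
    item['description'] = ""  # the page has no description block, so fall back to an empty string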
My code works, but I want to test multiple accounts.
I think I have to clear all the cookies between runs, but I've tried many things and they haven't worked.
Here is my code:
import mechanize
from bs4 import BeautifulSoup
import urllib2
import cookielib
import csv

fic_csv = raw_input("name : ")
delimiter = raw_input("délimiter : ")


def test():
    with open(fic_csv + ".csv", "r") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=':')
        pws = []
        mails = []
        i = 0
        while (i < 2):
            for row in readCSV:
                mail = row[0]
                pw = row[1]
                pws.append(pw)
                mails.append(mail)
            mail_site = mails[i]
            pw_site = pws[i]

            cj = cookielib.CookieJar()
            br = mechanize.Browser()
            br.set_cookiejar(cj)
            br.open("url")
            br.select_form(nr=0)
            br.form['login'] = mail_site
            br.form['password'] = pw_site
            res = br.submit()
            html = res.read()  # read and store the result html page

            list_srch = ["tag1", "tag2"]
            ii = 0
            while (ii < 2):
                br2 = mechanize.Browser()
                br2.set_cookiejar(cj)
                br2.open("url")
                br2.select_form(nr=0)
                br2.form['sq'] = list_srch[ii]
                res2 = br2.submit()
                html2 = res2.read()  # read and store the result html page

                soup = BeautifulSoup(html2, 'lxml')
                table1 = soup.findAll("table", {"width": "100%"})[13]
                tr1 = table1.findAll('tr')[3]
                table2 = tr1.findAll("table", {"width": "100%"})[0]
                tr2 = table2.findAll('tr')[1]
                tr3 = tr2.findAll('td')[5]
                resultat = tr3.string
                print resultat

                fic_dest = str(("%s.csv" % list_srch[ii]))
                with open(fic_dest, "w") as fdest:
                    writer = csv.writer(fdest)
                    writer.writerow((mail, pw, resultat))
                    print 'over'
                ii += 1
            i += 1


test()
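For what it's worth, a fresh cookielib.CookieJar() per account (as the loop above already creates) should give each login an isolated session; an existing jar can also be emptied between accounts with its clear() method. A minimal sketch, with accounts as a hypothetical list of (mail, password) tuples and "url" kept as the placeholder from the question:

import mechanize
import cookielib

cj = cookielib.CookieJar()
br = mechanize.Browser()
br.set_cookiejar(cj)

for mail_site, pw_site in accounts:   # accounts: hypothetical list of (mail, password) tuples
    cj.clear()                        # drop every cookie left over from the previous login
    br.open("url")                    # placeholder URL, as in the question
    br.select_form(nr=0)
    br.form['login'] = mail_site
    br.form['password'] = pw_site
    br.submit()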