How to clear cookies - python

My code works, but I want to test multiple accounts.
I think I have to clear all the cookies before my program runs, but I've tried many things and they haven't worked.
Here is my code:
import mechanize
from bs4 import BeautifulSoup
import urllib2
import cookielib
import csv
fic_csv = raw_input("name : ")
delimiter = raw_input("delimiter : ")
def test():
    with open(fic_csv + ".csv", "r") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=':')
        pws = []
        mails = []
        i = 0
        while i < 2:
            for row in readCSV:
                mail = row[0]
                pw = row[1]
                pws.append(pw)
                mails.append(mail)
            mail_site = mails[i]
            pw_site = pws[i]
            cj = cookielib.CookieJar()
            br = mechanize.Browser()
            br.set_cookiejar(cj)
            br.open("url")
            br.select_form(nr=0)
            br.form['login'] = mail_site
            br.form['password'] = pw_site
            res = br.submit()
            html = res.read()  # read and store the resulting HTML page
            list_srch = ["tag1", "tag2"]
            ii = 0
            while ii < 2:
                br2 = mechanize.Browser()
                br2.set_cookiejar(cj)
                br2.open("url")
                br2.select_form(nr=0)
                br2.form['sq'] = list_srch[ii]
                res2 = br2.submit()
                html2 = res2.read()  # read and store the resulting HTML page
                soup = BeautifulSoup(html2, 'lxml')
                table1 = soup.findAll("table", {"width": "100%"})[13]
                tr1 = table1.findAll('tr')[3]
                table2 = tr1.findAll("table", {"width": "100%"})[0]
                tr2 = table2.findAll('tr')[1]
                tr3 = tr2.findAll('td')[5]
                resultat = tr3.string
                print resultat
                fic_dest = "%s.csv" % list_srch[ii]
                with open(fic_dest, "w") as fdest:
                    writer = csv.writer(fdest)
                    writer.writerow((mail, pw, resultat))
                print 'over'
                ii += 1
            i += 1

test()
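A minimal sketch of one way to reset cookies between accounts, assuming the same mechanize/cookielib setup as the question (the URL and form field names are placeholders taken from the code above): either create a fresh CookieJar and Browser for each account, or empty the existing jar with clear() before the next login.

import mechanize
import cookielib

def login(mail, pw):
    # A fresh CookieJar and Browser per account means no session
    # cookies can leak over from the previous login.
    cj = cookielib.CookieJar()
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open("url")  # placeholder URL, as in the question
    br.select_form(nr=0)
    br.form['login'] = mail
    br.form['password'] = pw
    return br.submit()

# Alternatively, keep a single jar and empty it between accounts:
# cj.clear()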

Related

Extract names in custom <h2>, but they are extracted many times (BeautifulSoup)

I am trying to extract names from custom <h2> tags, but the names I want are extracted many times.
How can I fix this so each name is extracted only once?
The page I am pulling data from is here.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest

lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if page_num > page_limit // 25:
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class": "profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class": "photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class": "profile-website-header", "id": "firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name", "phone", "website", "logo"])
    wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, so a dictionary could be used to hold all of your data: simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv

lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False
    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)
        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)
                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")
                    a_phone = soup_details.find("a", {"class": "profile-phone-header profile-contact-btn"}, href=True)
                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None
                    div_logo = soup_details.find("div", {"class": "photo-container"})
                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None
                    a_website = soup_details.find("a", {"class": "profile-website-header", "id": "firm_website"})
                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None
                    lawyers[name] = [phone, logo, website]
                    found = True
    # Keep going until no new names are found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
    for name, details in lawyers.items():
        csv_output.writerow([name, *details])

Python BeautifulSoup webscraping

I hope somebody can help me with the following issue.
I would like to get the data in one row. This is what I get now in my CSV:
9200000083649863,bol.com retourdeals
9200000083649863,"41,75"
9200000083649863,ITidee
9200000083649863,"45,88"
9200000083649863,Bol.com
9200000083649863,"47,99"
What I would like:
9200000083649863,bol.com retourdeals ,41,75
9200000083649863,ITidee, 45,88
9200000083649863,Bol.com 47,99
This is the code:
def haalprijs_verkoper(ean, Urll):
    URL = Urll
    ean = ean
    page = requests.get(URL)
    csvfile = open('/home/filoor1/webscrape/book1.csv', 'a')
    csvwriter = csv.writer(csvfile)
    soup = ""
    results = ""
    soup = BeautifulSoup(page.text, 'html.parser')
    results = soup.find(id='offers')
    naam = results.find_all("p, strong")
    prijs = results.find_all("span")
    # print(results.prettify())
    counter = 0
    for tag in results.find_all(['strong', 'span']):
        # print(tag.text)
        aa = tag.text
        aa = aa.replace("Nieuw", "")
        aa = aa.replace(" ", "")
        aa = aa.replace("\n", "")
        aa = aa.replace("''", "aaaaaa")
        aa = aa.strip(' "')
        aa = aa.strip('"')
        if aa != "":
            counter += 0.5
            # print(ean, aa, counter)
            csvwriter.writerow([ean, aa])

haalprijs_verkoper(9200000083649863, 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc')
Thank you
You can use this example to scrape the data and save the correct CSV:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

ean = '9200000083649863'
all_data = []
for s, p in zip(soup.select('p.nosp > strong'),
                soup.select('span.product-prices__currency.product-prices__bol-price')):
    all_data.append([ean, s.get_text(strip=True), p.get_text(strip=True)])

with open('data.csv', 'w') as f_out:
    writer = csv.writer(f_out)
    writer.writerows(all_data)
Saves this data.csv:
9200000083649863,bol.com retourdeals,"41,75"
9200000083649863,ITidee,"45,88"
9200000083649863,Bol.com,"47,99"
9200000083649863,4Allshop,"49,70"
9200000083649863,codima,"51,69"
9200000083649863,PlazaSale.nl,"53,40"
9200000083649863,Stock Sellers B.V.,"53,67"
9200000083649863,Art & Craft,"54,27"
9200000083649863,ORM Wholesale,"54,38"
9200000083649863,DutchDo B.V.,"55,92"

How to make a request to a new URL?

I already have this code, which a friend helped me with before. I already get all the links on the site. I want to get the name, brand ("merk"), price, picture, and description of each product, plus the link to the product. The product's description only appears if we click the product.
I'm a beginner in Python.
from bs4 import BeautifulSoup
import urllib.request

count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"

def get_url(url):
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)

expected_url = url % count
response = get_url(expected_url)

link = []
name = []
merk = []
price = []
pic = []
description = []

while (response.url == expected_url):
    # print("GET {0}".format(expected_url))
    soup = BeautifulSoup(response.read(), "html.parser")
    products = soup.find("div", {"id": "product-list-grid"})
    for i in products:
        data = products.findAll("div", {"class": "product-item"})
        for j in range(0, len(data)):
            link.append(data[j]["data-eec-href"])
    count += 1
    expected_url = url % count
    response = get_url(expected_url)

print(len(link))

"""
import csv
dataset = zip(link, merk, name, pic, price, description)
with open("foundation_sociolla.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ['link', 'merk', 'name', 'pic', 'price', 'description']
    writer.writerow(header)
    writer.writerows(dataset)
"""
You need to make a request to the URL. Parse the content of that request and extract the data you want.
from bs4 import BeautifulSoup
import urllib.request

count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"

def get_url(url):
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)

expected_url = url % count
response = get_url(expected_url)

link = []
name = []
make = []
price = []
pic = []
description = []

while response.url == expected_url:
    soup = BeautifulSoup(response.read(), "html.parser")
    for product in soup.select("div.product-item"):
        product_url = product['data-eec-href']
        link.append(product_url)
        product_response = get_url(product_url)
        product_soup = BeautifulSoup(product_response.read(), "html.parser")
        product_pic = product_soup.select('img#bigpic')[0]['src']
        pic.append(product_pic)
        product_price = product_soup.select('span#our_price_display')[0].text.strip()
        price.append(product_price)
        product_name = product_soup.select('div.detail-product-logo p')[0].text.strip()
        name.append(product_name)
        product_make = product_soup.select('div.detail-product-logo h3')[0].text.strip()
        make.append(product_make)
        product_description = product_soup.select('div#Details article')[0].text.strip()
        description.append(product_description)
        print(product_url, product_pic, product_price, product_name, product_make, product_description)
    count += 1
    expected_url = url % count
    response = get_url(expected_url)
But if you're going to scrape a lot of pages, you are much better off using something like Scrapy: https://scrapy.org/
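For example, a minimal Scrapy spider for the same task could look something like the sketch below. The CSS selectors are copied from the answer above and the pagination selector is an assumption, so treat this as a starting point rather than working code for the real page.

import scrapy

class FoundationSpider(scrapy.Spider):
    name = "foundation"
    start_urls = ["https://www.sociolla.com/155-foundation?p=1"]

    def parse(self, response):
        # Follow each product card to its detail page.
        for href in response.css("div.product-item::attr(data-eec-href)").getall():
            yield response.follow(href, callback=self.parse_product)
        # Follow pagination if a "next" link exists (this selector is an assumption).
        next_page = response.css("a.next::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_product(self, response):
        yield {
            "link": response.url,
            "name": response.css("div.detail-product-logo p::text").get(default="").strip(),
            "merk": response.css("div.detail-product-logo h3::text").get(default="").strip(),
            "price": response.css("span#our_price_display::text").get(default="").strip(),
            "pic": response.css("img#bigpic::attr(src)").get(),
            "description": " ".join(response.css("div#Details article ::text").getall()).strip(),
        }

# Run with: scrapy runspider foundation_spider.py -o foundation_sociolla.csv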

Python BeautifulSoup parsing speed improvement

I have written my first Python script to loop through some URLs listed in a CSV, over 14,000 links. I am trying to 1) get all the keyword meta tags, 2) check the page status (404 links need to be flagged), and 3) convert YouTube videos into the embed YouTube link (after maybe going to the webpage, getting the keywords, and then converting to the embed link).
It is going very slowly, but I cannot figure out a faster way. I feel like it is the requests.get() call, but I don't know how to speed it up. I only need the metadata, but is there a way to get only the beginning of the page and not all of it? How do I make this code better / faster / optimized?
Also, when compiling with PyInstaller I get a collections problem. I feel like I am using Python 2 code in Python 3, as I am writing in Python 3.5.
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
source_code = ''
myURL = ''
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

with open(newfilename, "w", newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(('cmsid', 'filepath', 'metatags', 'pageurl', 'pageurlchange'))
file.close()

with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    for row in csv_f:
        if len(row) != 0:
            # init variables
            myKeywords = ""
            myTitle = ''
            myURL = ''
            pageUrlChange = ''
            pageStatus = ''
            pageUrl = ''
            myCmsid = (row[0])
            myURL = (row[2])
            if "https://www.youtube.com/embed/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[4]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
                myURL = 'https://www.youtube.com/watch?v=' + youtubeurl
            try:
                source_code = requests.get(myURL)
            except Exception:
                with open('errors.txt', 'a', newline='') as file:
                    writer = csv.writer(file, delimiter=',')
                    writer.writerow((myCmsid, myURL))
                file.close()
            pageStatus = source_code.status_code
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, 'html.parser')
            pageStatus = str(pageStatus)
            pageStatus = pageStatus[:1]
            pageStatus = int(pageStatus)
            if pageStatus == 2:
                pageUrlChange = 0
            else:
                pageUrlChange = 1
            if pageStatus == 3:
                pageUrl = source_code.url
            l = soup.findAll("meta", attrs={"name": "keywords"})
            if l is None:
                myKeywords = ""
            else:
                try:
                    myKeywords = l[0]['content']
                except:
                    myKeywords = myKeywords
            myKeywords = myKeywords.replace(', ', '~')
            myKeywords = myKeywords.replace(',', '~')
            myKeywords = myKeywords.replace('(', '')
            myKeywords = myKeywords.replace(')', '')
            if soup.find('title'):
                myTitle = soup.find('title').string
            if "https://www.youtube.com/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
                # print(youtubeurl)
            if "https://youtu.be/" in myURL:
                youtubeurl = myURL.split('/')
                youtubeurl = youtubeurl[3]
                youtubeurl = re.sub(
                    r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
                myURL = 'https://www.youtube.com/embed/' + youtubeurl
                # print(youtubeurl)
            # print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            with open(newfilename, "a", newline='') as file:
                writer = csv.writer(file, delimiter=',')
                writer.writerow((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
            file.close()
f.close()
html.parser is a pure-Python implementation using regular expressions. You really don't want to use it. Install lxml and have the parsing done in C code instead (do remember to then use BeautifulSoup(plain_text, 'lxml')).
You also don't want to keep re-opening your CSV file. Open it once, outside your loop, and simply write new rows to the csv.writer() object in your loop.
You can't otherwise speed up URL loading much; network speed is always going to be a bottleneck. You could use the very low-level PyCurl library, but I doubt the speedups it can offer would have an impact here.
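A condensed sketch of those suggestions (lxml for parsing, the output CSV opened once outside the loop), keeping only the keyword extraction from the question; the reused requests.Session is an extra assumption on top of the answer, and the status and YouTube handling are omitted for brevity.

import csv
import requests
from bs4 import BeautifulSoup

linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords.csv'

session = requests.Session()  # reuse one connection pool for all requests

with open(linkLocation, "r", encoding="utf-8-sig") as f, \
        open(newfilename, "w", newline='') as out:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    writer = csv.writer(out)  # one writer, created once, outside the loop
    writer.writerow(('cmsid', 'pageurl', 'metatags'))
    for row in reader:
        if not row:
            continue
        response = session.get(row[2])
        soup = BeautifulSoup(response.text, 'lxml')  # lxml: parsing happens in C
        meta = soup.find("meta", attrs={"name": "keywords"})
        keywords = meta.get('content', '') if meta else ''
        writer.writerow((row[0], response.url, keywords))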
In addition to the excellent suggestion to move to a faster parser, this is a good candidate for parallelization via the multiprocessing module. I've rearranged your code to do the request/parsing in a worker that can be delegated to a subprocess. The worker returns the row that needs to be added to the CSV. I added a 0/-1 error code to the front of the returned row so the parent process knows which CSV gets the result.
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import multiprocessing
import traceback

def grabber(args):
    # imap_unordered passes each item as a single argument, so unpack the
    # (cmsid, url) pair here.
    myCmsid, myURL = args
    try:
        return grabber_impl(myCmsid, myURL)
    except:
        return (-1, myCmsid, myURL, traceback.format_exc())

def grabber_impl(myCmsid, myURL):
    # init variables
    myKeywords = ""
    myTitle = ''
    pageUrlChange = ''
    pageStatus = ''
    pageUrl = ''
    if "https://www.youtube.com/embed/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[4]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
        myURL = 'https://www.youtube.com/watch?v=' + youtubeurl
    source_code = requests.get(myURL)
    pageStatus = source_code.status_code
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    pageStatus = str(pageStatus)
    pageStatus = pageStatus[:1]
    pageStatus = int(pageStatus)
    if pageStatus == 2:
        pageUrlChange = 0
    else:
        pageUrlChange = 1
    if pageStatus == 3:
        pageUrl = source_code.url
    l = soup.findAll("meta", attrs={"name": "keywords"})
    if l is None:
        myKeywords = ""
    else:
        try:
            myKeywords = l[0]['content']
        except:
            myKeywords = myKeywords
    myKeywords = myKeywords.replace(', ', '~')
    myKeywords = myKeywords.replace(',', '~')
    myKeywords = myKeywords.replace('(', '')
    myKeywords = myKeywords.replace(')', '')
    if soup.find('title'):
        myTitle = soup.find('title').string
    if "https://www.youtube.com/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
        # print(youtubeurl)
    if "https://youtu.be/" in myURL:
        youtubeurl = myURL.split('/')
        youtubeurl = youtubeurl[3]
        youtubeurl = re.sub(
            r'\?|\&|\=|re(l=\d+|l)|featur(e=sub|e)|playnext|video(s=\w+|s)|watch|_?((youtu(\.be|be))|fro(m=TL|m)|gdata|player|lis(t=\w+|t)|(inde(x=\w+|x)))_?|(v|vi)=|channel|ytscreeningroom', '', youtubeurl)
        myURL = 'https://www.youtube.com/embed/' + youtubeurl
        # print(youtubeurl)
    # print((myCmsid, myURL, myKeywords, pageUrl, pageUrlChange))
    return (0, myCmsid, myURL, myKeywords, pageUrl, pageUrlChange)

linkLocation = r'C:\Users\JCModern\Desktop\content_links.csv'
timestr = time.strftime("%Y%m%d_%H%M%S")
newfilename = r'C:\Users\JCModern\Desktop\content_links_and_keywords_' + timestr + '.csv'

# Note: on Windows, the code below should be wrapped in an
# `if __name__ == '__main__':` guard so the worker processes can import
# this module without re-running it.
with open(linkLocation, "r", encoding="utf-8-sig") as f:
    csv_f = csv.reader(f, delimiter=",")
    next(csv_f, None)
    pool = multiprocessing.Pool()
    with open(newfilename, 'a', newline='') as out, open('errors.txt', 'a', newline='') as err:
        writer = csv.writer(out, delimiter=',')
        err_writer = csv.writer(err, delimiter=',')
        for result in pool.imap_unordered(grabber, ((row[0], row[2]) for row in csv_f if row), chunksize=1):
            if result[0] == 0:
                writer.writerow(result[1:])
            else:
                print(result[3])
                err_writer.writerow(result[1:3])
    pool.close()
    pool.join()

Logic flow - trying to iterate through website pages with BeautifulSoup and CSV Writer

I can't seem to figure out the proper indents/clause placements to get this to loop through more than one page. This code currently prints out a CSV file fine, but only does it for the first page.
#THIS WORKS BUT ONLY PRINTS THE FIRST PAGE
from bs4 import BeautifulSoup
from urllib2 import urlopen
import csv

page_num = 1
total_pages = 20

with open("MegaMillions.tsv", "w") as f:
    fieldnames = ['date', 'numbers', 'moneyball']
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(fieldnames)
    while page_num < total_pages:
        page_num = str(page_num)
        soup = BeautifulSoup(urlopen('http://www.usamega.com/mega-millions-history.asp?p=' + page_num).read())
        for row in soup('table', {'bgcolor': 'white'})[0].findAll('tr'):
            tds = row('td')
            if tds[1].a is not None:
                date = tds[1].a.string.encode("utf-8")
            if tds[3].b is not None:
                uglynumber = tds[3].b.string.split()
                betternumber = [int(uglynumber[i]) for i in range(len(uglynumber)) if i % 2 == 0]
                moneyball = tds[3].strong.string.encode("utf-8")
                writer.writerow([date, betternumber, moneyball])
            page_num = int(page_num)
            page_num += 1

print 'We\'re done here.'
And of course, this only prints the last page:
#THIS WORKS BUT ONLY PRINTS THE LAST PAGE
from bs4 import BeautifulSoup
from urllib2 import urlopen
import csv

page_num = 1
total_pages = 20

while page_num < total_pages:
    page_num = str(page_num)
    soup = BeautifulSoup(urlopen('http://www.usamega.com/mega-millions-history.asp?p=' + page_num).read())
    with open("MegaMillions.tsv", "w") as f:
        fieldnames = ['date', 'numbers', 'moneyball']
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(fieldnames)
        for row in soup('table', {'bgcolor': 'white'})[0].findAll('tr'):
            tds = row('td')
            if tds[1].a is not None:
                date = tds[1].a.string.encode("utf-8")
            if tds[3].b is not None:
                uglynumber = tds[3].b.string.split()
                betternumber = [int(uglynumber[i]) for i in range(len(uglynumber)) if i % 2 == 0]
                moneyball = tds[3].strong.string.encode("utf-8")
                writer.writerow([date, betternumber, moneyball])
    page_num = int(page_num)
    page_num += 1

print 'We\'re done here.'
The issue with your second code example is that you're overwriting your file each time. Instead of
open("MegaMillions.tsv","w")
use
open("MegaMillions.tsv","a")
The "a" opens the file for appending, which is what you want to do
Thanks to the suggestions, here's one variation that works:
from bs4 import BeautifulSoup
from urllib2 import urlopen
import csv

page_num = 1
total_pages = 73

with open("MegaMillions.tsv", "w") as f:
    fieldnames = ['date', 'numbers', 'moneyball']
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(fieldnames)
    while page_num <= total_pages:
        page_num = str(page_num)
        soup = BeautifulSoup(urlopen('http://www.usamega.com/mega-millions-history.asp?p=' + page_num).read())
        for row in soup('table', {'bgcolor': 'white'})[0].findAll('tr'):
            tds = row('td')
            if tds[1].a is not None:
                date = tds[1].a.string.encode("utf-8")
            if tds[3].b is not None:
                uglynumber = tds[3].b.string.split()
                betternumber = [int(uglynumber[i]) for i in range(len(uglynumber)) if i % 2 == 0]
                moneyball = tds[3].strong.string.encode("utf-8")
                writer.writerow([date, betternumber, moneyball])
        page_num = int(page_num)
        page_num += 1

print 'We\'re done here.'
I opted for this over the 'a' approach because with that one the header row gets written for every page.
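If you did want to keep append mode, one way to avoid repeating the header row (a sketch, not from the original thread) is to write it only when the file doesn't exist yet or is empty:

import os
import csv

def append_rows(path, fieldnames, rows):
    # Write the header only if the file does not exist yet or is empty.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, "a") as f:
        writer = csv.writer(f, delimiter='\t')
        if write_header:
            writer.writerow(fieldnames)
        writer.writerows(rows)

# Placeholder values, just to show the call shape:
append_rows("MegaMillions.tsv", ['date', 'numbers', 'moneyball'],
            [['some-date', [1, 2, 3, 4, 5], '6']])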
