Python BeautifulSoup web scraping

I hope somebody can help me with the following issue.
I would like to get the data in one row. This is what I get now in my CSV:
9200000083649863,bol.com retourdeals
9200000083649863,"41,75"
9200000083649863,ITidee
9200000083649863,"45,88"
9200000083649863,Bol.com
9200000083649863,"47,99"
What I would like:
9200000083649863,bol.com retourdeals,41,75
9200000083649863,ITidee,45,88
9200000083649863,Bol.com,47,99
This is the code:
import csv
import requests
from bs4 import BeautifulSoup

def haalprijs_verkoper(ean, Urll):
    URL = Urll
    ean = ean
    page = requests.get(URL)
    csvfile = open('/home/filoor1/webscrape/book1.csv', 'a')
    csvwriter = csv.writer(csvfile)
    soup = ""
    results = ""
    soup = BeautifulSoup(page.text, 'html.parser')
    results = soup.find(id='offers')
    naam = results.find_all("p, strong")
    prijs = results.find_all("span")
    # print(results.prettify())
    counter = 0
    for tag in results.find_all(['strong', 'span']):
        # print(tag.text)
        aa = tag.text
        aa = aa.replace("Nieuw", "")
        aa = aa.replace(" ", "")
        aa = aa.replace("\n", "")
        aa = aa.replace("''", "aaaaaa")
        aa = aa.strip(' "')
        aa = aa.strip('"')
        if aa != "":
            counter += 0.5
            # print(ean, aa, counter)
            csvwriter.writerow([ean, aa])

haalprijs_verkoper(9200000083649863, 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc')
Thank you

You can use this example to scrape the data and save the correct CSV:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
ean = '9200000083649863'

all_data = []
# pair each seller name with the matching price from the offers list
for s, p in zip(soup.select('p.nosp > strong'),
                soup.select('span.product-prices__currency.product-prices__bol-price')):
    all_data.append([ean, s.get_text(strip=True), p.get_text(strip=True)])

with open('data.csv', 'w') as f_out:
    writer = csv.writer(f_out)
    writer.writerows(all_data)
Saves this data.csv:
9200000083649863,bol.com retourdeals,"41,75"
9200000083649863,ITidee,"45,88"
9200000083649863,Bol.com,"47,99"
9200000083649863,4Allshop,"49,70"
9200000083649863,codima,"51,69"
9200000083649863,PlazaSale.nl,"53,40"
9200000083649863,Stock Sellers B.V.,"53,67"
9200000083649863,Art & Craft,"54,27"
9200000083649863,ORM Wholesale,"54,38"
9200000083649863,DutchDo B.V.,"55,92"
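Note that because the prices use a comma as the decimal separator, csv.writer quotes those fields, which is why the output shows "41,75". If you would rather have unquoted values, one option is to switch to a semicolon delimiter. A small, optional variation on the final write step of the snippet above:

with open('data.csv', 'w', newline='') as f_out:
    # semicolon-delimited, so values like 41,75 no longer need quoting
    writer = csv.writer(f_out, delimiter=';')
    writer.writerows(all_data)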

Related

Extract names in custom <h2> but they are extracted many times (BeautifulSoup)

I am trying to extract names in custom <h2> tags, but the names I want are extracted many times.
How do I fix this problem and extract each name only once?
The page I am pulling data from is here.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest

lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if(page_num > page_limit // 25):
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class": "profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class": "photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class": "profile-website-header", "id": "firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name", "phone", "website", "logo"])
    wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, so a dictionary can be used to hold all of your data: simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv

lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False
    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)
        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)
                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")
                    a_phone = soup_details.find("a", {"class": "profile-phone-header profile-contact-btn"}, href=True)
                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None
                    div_logo = soup_details.find("div", {"class": "photo-container"})
                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None
                    a_website = soup_details.find("a", {"class": "profile-website-header", "id": "firm_website"})
                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None
                    lawyers[name] = [phone, logo, website]
                    found = True
    # Keep going until no new names found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
    for name, details in lawyers.items():
        csv_output.writerow([name, *details])
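A note on the approach: lawyers is keyed by name, so a lawyer that appears on several result pages is only fetched and written once, and the found flag ends the while loop on the first page that yields no new names, which avoids hard-coding a page limit the way the original page_limit check did.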

Looping through multiple pages using BeautifulSoup

I'm quite new to web scraping using BeautifulSoup, and I'm attempting to loop through multiple pages and write everything into a CSV file.
Here is my current code:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

urls = []
url = "https://www.newegg.com/Black-Friday-Deals/EventSaleStore/ID-10475/"
for page in range(1, 7):
    pageurl = 'Page-{1}'.format(url, page)
    urls.append(pageurl)
    page = urlopen(url)
    html = page.read().decode("utf-8")
    page.close()
    page_soup = soup(html, "html.parser")
    containers = page_soup.findAll("div", {"class": "item-container"})
    print(len(containers))
    filename = "BlackFridayNewegg.csv"
    f = open(filename, "w")
    headers = "Product, Previous price, Current price"
    f.write(headers)
    for container in containers:
        rating_container = page_soup.findAll("span", {"class": "item-rating-num"})
        rating = rating_container[0].text.strip()
        title_container = container.findAll("a", {"class": "item-title"})
        title = title_container[0].text.strip()
        prev_price_container = container.findAll("li", {"class": "price-was"})
        prev_price = prev_price_container[0].text
        current_price_container = container.findAll("li", {"class": "price-current"})
        current_price = current_price_container[0].text
        print("Product Name: " + title)
        print("Previous Price: " + prev_price)
        print("Current Price: " + current_price)
        result = title.replace(",", "") + "," + prev_price + "," + current_price + "\n"
        f.write(result)
    f.close()
My code is working properly and will display text from multiple pages, but it won't write it all into the file. Any reason as to why this is happening?
f = open(filename, "w") clears the file each time it runs, so only the last page's rows survive. You need f = open(filename, "a") to append instead.
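Another option is to open the file once, before looping over the pages, and let the csv module handle the quoting. A minimal sketch along those lines, assuming the page URLs really are the base URL followed by Page-N and that the selectors from the question still match the page:

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://www.newegg.com/Black-Friday-Deals/EventSaleStore/ID-10475/"

# open the output file once, so later pages add rows instead of wiping the file
with open("BlackFridayNewegg.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Product", "Previous price", "Current price"])
    for page in range(1, 7):
        html = urlopen(url + "Page-{}".format(page)).read().decode("utf-8")
        page_soup = BeautifulSoup(html, "html.parser")
        for container in page_soup.find_all("div", {"class": "item-container"}):
            title = container.find("a", {"class": "item-title"}).text.strip()
            prev = container.find("li", {"class": "price-was"})
            curr = container.find("li", {"class": "price-current"})
            # some items have no "was" price, so guard against None
            writer.writerow([title,
                             prev.text.strip() if prev else "",
                             curr.text.strip() if curr else ""])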

How to clear cookies

My code works, but I want to test multiple accounts.
I think I have to clear all the cookies before my program runs, but I've tried many things and they haven't worked.
Here is my code:
import mechanize
from bs4 import BeautifulSoup
import urllib2
import cookielib
import csv

fic_csv = raw_input("name : ")
delimiter = raw_input("delimiter : ")

def test():
    with open(fic_csv + ".csv", "r") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=':')
        pws = []
        mails = []
        i = 0
        while (i < 2):
            for row in readCSV:
                mail = row[0]
                pw = row[1]
                pws.append(pw)
                mails.append(mail)
            mail_site = mails[i]
            pw_site = pws[i]
            cj = cookielib.CookieJar()
            br = mechanize.Browser()
            br.set_cookiejar(cj)
            br.open("url")
            br.select_form(nr=0)
            br.form['login'] = mail_site
            br.form['password'] = pw_site
            res = br.submit()
            html = res.read()  # read and store the result html page
            list_srch = ["tag1", "tag2"]
            ii = 0
            while (ii < 2):
                br2 = mechanize.Browser()
                br2.set_cookiejar(cj)
                br2.open("url")
                br2.select_form(nr=0)
                br2.form['sq'] = list_srch[ii]
                res2 = br2.submit()
                html2 = res2.read()  # read and store the result html page
                soup = BeautifulSoup(html2, 'lxml')
                table1 = soup.findAll("table", {"width": "100%"})[13]
                tr1 = table1.findAll('tr')[3]
                table2 = tr1.findAll("table", {"width": "100%"})[0]
                tr2 = table2.findAll('tr')[1]
                tr3 = tr2.findAll('td')[5]
                resultat = tr3.string
                print resultat
                fic_dest = str(("%s.csv" % list_srch[ii]))
                with open(fic_dest, "w") as fdest:
                    writer = csv.writer(fdest)
                    writer.writerow((mail, pw, resultat))
                print 'over'
                ii += 1
            i += 1

test()
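No answer is shown for this one, but one way to make sure each account starts with a clean session is to create a fresh CookieJar (or call cj.clear()) at the top of every account iteration, before the login request. A minimal sketch, assuming the same mechanize/cookielib setup as above; the login URL and form field names are placeholders copied from the question:

import mechanize
import cookielib

def login_with_fresh_session(mail_site, pw_site):
    # a brand-new CookieJar per account, so no cookies leak between logins
    cj = cookielib.CookieJar()
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open("url")  # placeholder login URL, as in the question
    br.select_form(nr=0)
    br.form['login'] = mail_site
    br.form['password'] = pw_site
    return br, cj

# alternatively, keep one jar and wipe it between accounts:
# cj.clear()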

Python: save same-title files in the folder

Code:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import json
from os import listdir

res = requests.get('http://www.abcde.com/frontend/SearchParts')
soup = BeautifulSoup(res.text, "lxml")
href = [a["href"] for a in soup.findAll("a", {"id": re.compile("parts_img.*")})]

b1 = []
for url in href:
    b1.append("http://www.abcde.com" + url)
# print(b1)

b = []
for i in range(len(b1)):
    res2 = requests.get(b1[i]).text
    soup2 = BeautifulSoup(res2, "lxml")
    url_n = soup2.find('', rel='next')['href']
    url_n = ("http://www.abcde.com" + url_n)
    # print(url_n)
    b.append(b1[i])
    b.append(url_n)
    while True:
        res3 = requests.get(url_n).text
        soup3 = BeautifulSoup(res3, "lxml")
        try:
            url_n = soup3.find('', rel='next')['href']
        except TypeError:
            break
        if url_n:
            url_n = ("http://www.abcde.com" + url_n)
            # print(url_n)
            b.append(url_n)

all = []
for url in b:
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".article-title"):
        all.append(urljoin('http://www.abcde.com', item['href']))

for urls in all:
    re = requests.get(urls)
    soup = BeautifulSoup(re.text.encode('utf-8'), "html.parser")
    title_tag = soup.select_one('.page_article_title')
    list = []
    for tag in soup.select('.page_article_content'):
        list.append(tag.text)
    list = ([c.replace('\n', '') for c in list])
    list = ([c.replace('\r', '') for c in list])
    list = ([c.replace('\t', '') for c in list])
    list = ([c.replace(u'\xa0', u' ') for c in list])
    list = (', '.join(list))
    fruit_tag = soup.select_one('.authorlink')
    fruit_final = None
    if fruit_tag:
        fruit_final = fruit_tag.text
    else:
        fruit_final = fruit_tag
    keys = soup.findAll('div', style="font-size:1.2em;")
    keys_final = None
    list2 = []
    if keys:
        for key in keys:
            list2.append(key.text)
        list2 = ([c.replace('\n', '') for c in list2])
        list2 = ([c.replace(' ', '') for c in list2])
        list2 = (', '.join(list2))
        key_final = list2
    else:
        key_final = keys
    if key_final == []:
        key_final = None
    ################## edit part ####################################
    data = {
        "Title": title_tag.text,
        "Registration": fruit_final,
        "Keywords": key_final,
        "Article": list
    }
    save_path = "C:/json/"
    files = listdir(save_path)
    file_name = save_path + '%s.json' % title_tag.text
    with open(file_name, 'w', encoding='UTF-8') as f:
        if file_name not in files:
            file_d = json.dumps(data, ensure_ascii=False)
            f.write(file_d)
        else:
            file_name = save_path + '%s_1.json' % title_tag.text
            file_d = json.dumps(data, ensure_ascii=False)
            f.write(file_d)
I scraped a web page and extract every article's title as title_tag.text. I found that some articles have the same title but different URLs/contents, so I still need to save all of them in my directory. I know how to handle two titles that are the same: I can name one as original and the other as original_1. But what if I need to save four files which have the same title? How do I do it in this case? Thanks in advance!
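No answer is shown here, but a common way to handle any number of clashes is to keep appending an increasing counter until the file name is free. A minimal sketch, assuming the same save_path and data dict built in the code above:

import json
import os

def unique_path(save_path, title):
    # return save_path/title.json, or title_1.json, title_2.json, ... if already taken
    candidate = os.path.join(save_path, '%s.json' % title)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(save_path, '%s_%d.json' % (title, counter))
        counter += 1
    return candidate

# usage inside the article loop, instead of the if/else at the end:
# file_name = unique_path(save_path, title_tag.text)
# with open(file_name, 'w', encoding='UTF-8') as f:
#     f.write(json.dumps(data, ensure_ascii=False))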

Parsing a table with BeautifulSoup and writing it to a text file

I need the data from the table in a text file (output.txt) in this format:
data1;data2;data3;data4;.....
Celkova podlahova plocha bytu;33m;Vytah;Ano;Nadzemne podlazie;Prizemne podlazie;.....;Forma vlastnictva;Osobne
All in "one line", with ";" as the separator (for a later export to a CSV file).
I'm a beginner. Help, thanks.
from BeautifulSoup import BeautifulSoup
import urllib2
import codecs

response = urllib2.urlopen('http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class": "detail-char"})
for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    prvy = col[0].string.strip()
    druhy = col[1].string.strip()
    record = ([prvy], [druhy])

fl = codecs.open('output.txt', 'wb', 'utf8')
for rec in record:
    line = ''
    for val in rec:
        line += val + u';'
    fl.write(line + u'\r\n')
fl.close()
You are not keeping each record as you read it in. Try this, which stores the records in records:
from BeautifulSoup import BeautifulSoup
import urllib2
import codecs

response = urllib2.urlopen('http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class": "detail-char"})

records = []  # store all of the records in this list
for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    prvy = col[0].string.strip()
    druhy = col[1].string.strip()
    record = '%s;%s' % (prvy, druhy)  # store the record with a ';' between prvy and druhy
    records.append(record)

fl = codecs.open('output.txt', 'wb', 'utf8')
line = ';'.join(records)
fl.write(line + u'\r\n')
fl.close()
This could be cleaned up more, but I think it's what you want.
Here's an alternative, non-BeautifulSoup way, just for your task:
import urllib2

store = []  # to store your results
url = """http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever"""
page = urllib2.urlopen(url)
data = page.read()
for table in data.split("</table>"):
    if "<table" in table and 'class="detail-char' in table:
        for item in table.split("</td>"):
            if "<td" in item:
                store.append(item.split(">")[-1].strip())
print ','.join(store)
output
$ ./python.py
Celková podlahová plocha bytu,33 m2,Výťah,Áno,Nadzemné podlažie,Prízemné podlažie,Stav,Čiastočná rekonštrukcia,Konštrukcia bytu,tehlová,Forma vlastníctva,osobné
