I'm pretty new to web scraping but enjoying it so far so thought I'd test myself!
I've written this query to scrape this website but just wondering is there a way of making it more efficient? At the moment, I've had to set the max page to 87 as this is the last page that guitars appear on. However, amps only have 15 pages of results but I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
n = 88  # range(1, n) below requests pages 1..87 for every category

# Collect product name, price and availability for every category.
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    for x in range(1, n):
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Three parallel lists, one entry per product tile on the page.
        products = [product.text.strip() for product in soup.find_all('h3', {'class': 'qa-product-list-item-title'})]
        # [:-1] drops the last character of the price text.
        prices = [price.text.strip()[:-1] for price in soup.find_all('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.find_all('div', {'class': 'availability'})]

        # zip pairs the lists by position; it assumes the three lists line up.
        for product, price, avail in zip(products, prices, avails):
            guitar_products.append({
                'product': product,
                'price': price,
                'avail': avail,
            })

guitar_data = pd.DataFrame(guitar_products)
# Raw string so that \d reaches the regex engine instead of being an invalid
# string escape; keeps only digits and dots before the numeric conversion.
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Thanks
Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []

# Collect product name, price and availability for every category.
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    page_number = 1

    while True:
        url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
        print(url)
        page_number += 1

        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')

        for div_product in soup.find_all('div', class_="product-inner"):
            title_tag = div_product.find('h3', {'class': 'qa-product-list-item-title'})
            price_tag = div_product.find('span', {'class': 'js-pounds'})
            avail_tag = div_product.find('div', {'class': 'availability'})

            # find() returns None when an element is absent; skip such tiles
            # instead of crashing on None.get_text().
            if title_tag is None or price_tag is None or avail_tag is None:
                continue

            guitar_products.append({
                'product': title_tag.get_text(strip=True),
                'price': price_tag.get_text(strip=True),
                'avail': avail_tag.get_text(strip=True),
            })

        # Is there a next button? If not, this was the category's last page.
        if not soup.find('a', class_="next-page-button"):
            print("No more")
            break

guitar_data = pd.DataFrame(guitar_products)
# Raw string so that \d reaches the regex engine instead of being an invalid
# string escape; keeps only digits and dots before the numeric conversion.
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Improvements:
This looks for the Next button on each page; when there isn't one, the loop stops and the script moves on to the next category.
It locates the <div> holding each product and then uses a single find to get each product detail. This avoids the need to build multiple lists and then join them.
The URL is built using a Python f-string.
You can check H1:
*soup = BeautifulSoup(page.content, 'html.parser')*
if soup.find('h1').contents[0] == 'Page Not Found':
break
or change the loop from `for` to `while`:
is_page = True
x = 0
while is_page:
    x = x + 1
    # ... request page x and build `soup` exactly as before ...
    heading = soup.find('h1')
    if heading is not None and heading.get_text(strip=True) == 'Page Not Found':
        is_page = False
        # break also skips whatever follows in the loop body for this page.
        break
This is probably not the most elegant solution, but it is functional and straightforward. An infinite loop which ends if no product is found.
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []

# Collect product name, price and availability for every category.
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    n = 1  # page counter, restarted for each category
    while True:
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Three parallel lists, one entry per product tile on the page.
        products = [product.text.strip() for product in soup.find_all('h3', {'class': 'qa-product-list-item-title'})]
        # [:-1] drops the last character of the price text.
        prices = [price.text.strip()[:-1] for price in soup.find_all('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.find_all('div', {'class': 'availability'})]

        # A page without products ends this category.
        if not products:
            break

        # zip pairs the lists by position; it assumes the three lists line up.
        for product, price, avail in zip(products, prices, avails):
            guitar_products.append({
                'product': product,
                'price': price,
                'avail': avail,
            })
        n += 1

guitar_data = pd.DataFrame(guitar_products)
# Raw string so that \d reaches the regex engine instead of being an invalid
# string escape; keeps only digits and dots before the numeric conversion.
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
I am a marketer and want to conduct some basic market research using Python.
I wrote some simple code to crawl the titles from multiple pages, but I can't work out how to put the title text into a list and export it to Excel. How can I do this?
I tried to create a list and used the extend() method to put these looped titles on the list, but it did not work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
def content_get(url):
    """Print the post titles found on up to four consecutive index pages.

    Starting at ``url``, each page's titles are printed (each wrapped in a
    one-element list) and the "‹ 上頁" link is followed to reach the next
    page to crawl.

    Args:
        url: Address of the first index page to crawl.
    """
    count = 0
    while count < 4:  # this case was to crawl titles of 4 pages
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        titles = soup.find(id="main-container").find_all("div", class_="r-ent")
        for title in titles:
            print([title.find('div', class_='title').text])

        # Stop instead of raising when the link is missing or carries no
        # href, so there is nothing further to follow.
        nextpage = soup.find("a", string="‹ 上頁")
        nextpageurl = nextpage.get("href") if nextpage is not None else None
        if not nextpageurl:
            break
        url = "https://www.ptt.cc" + nextpageurl
        count += 1
firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
content_get(firstpage)
You need to add the titles to a list outside of the while loop:
def content_get(url, max_pages=4):
    """Collect the post titles from consecutive index pages.

    Starting at ``url``, the titles of each page are appended to one list and
    the "‹ 上頁" link is followed to reach the next page to crawl.

    Args:
        url: Address of the first index page to crawl.
        max_pages: Maximum number of pages to read (defaults to 4).

    Returns:
        A list with the text of every title found, newlines removed.
    """
    titles = []  # lives outside the loop so it accumulates across pages
    for _ in range(max_pages):
        r = requests.get(url)
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed.
        soup = BeautifulSoup(r.text, "html.parser")
        title_page = [title.text.replace('\n', '') for title in soup.find_all('div', {'class': 'title'})]
        titles.extend(title_page)

        # Stop instead of raising when the link is missing or carries no
        # href, so there is nothing further to follow.
        nextpage = soup.find("a", string="‹ 上頁")
        nextpageurl = nextpage.get("href") if nextpage is not None else None
        if not nextpageurl:
            break
        url = "https://www.ptt.cc" + nextpageurl
    return titles
If you don't want to use the list comprehension to build title_page, it can be replaced with a traditional for loop:
# Explicit-loop equivalent of the title_page list comprehension.
# Distinct names are used so the accumulating `titles` list is not overwritten.
title_page = []
title_divs = soup.find_all('div', {'class': 'title'})
for title_div in title_divs:
    title_page.append(title_div.text.replace('\n', ''))
For the excel file:
def to_excel(text, path='output.xlsx'):
    """Write ``text`` to an Excel workbook with a single ``Title`` column.

    Args:
        text: The values to store, one per row.
        path: Destination workbook; defaults to ``output.xlsx``.

    Returns:
        Whatever ``DataFrame.to_excel`` returns.
    """
    df = pd.DataFrame(text, columns=['Title'])
    return df.to_excel(path)
I am new to BS4 and Python.
For a project I am trying to get some real estate data.
I wrote my code so that it gets two lists.
My challenge is to combine the data in the output.
Can anyone help me, please?
Thank you.
PS: any tips on more efficient code are welcome.
from selenium import webdriver
from bs4 import BeautifulSoup
#open('output.csv', 'w').close()
import re
import time
import requests
from itertools import chain
from pandas import DataFrame
import csv
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
    """Scrape jaap.nl Amsterdam sale listings into ``output.csv``.

    For each result page from 1 to ``max_pages`` the function writes one row
    per listing (street, address, price, pricetag) and then one row per
    listing link (soort, bouwjaar, woonoppervlakte, inhoud, perceel), read
    from the page the link points to. Pages are loaded through the
    module-level Selenium ``browser``.

    Args:
        max_pages: Number of result pages to visit, starting at page 1.
    """
    detail_columns = 5

    # Open the output exactly once: the file is truncated a single time and
    # closed on exit, rather than being re-opened (and leaked) for every row.
    # csv.writer quotes any field that itself contains a comma or newline.
    with open('output.csv', 'w', newline='') as save_file:
        writer = csv.writer(save_file, lineterminator='\n')

        page = 1
        while page <= max_pages:
            url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
            browser.get(url)
            time.sleep(5)  # fixed pause after loading the result page

            soup = BeautifulSoup(browser.page_source, 'html.parser')
            info = soup.find_all('div', {'class': 'property-info'})
            inside = soup.find_all('a', {'class': 'property-inner'})

            # Summary data taken from the result page itself.
            for huis in info:
                # Keep only the first three whitespace-separated tokens of the heading.
                street = huis.find('h2')
                street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:3])

                address = huis.find('div')
                address = address.find('div').text.strip()

                # Reduce the price text to its digits.
                price = huis.find('div', {'class': 'price-info'})
                price = price.find('div').text.strip()
                price = ''.join(re.findall(r'\d', price))

                pricetag = huis.find('div', {'class': 'property-price'})
                pricetag = pricetag.find('span').text.strip()

                writer.writerow([street, address, price, pricetag])

            # Detail data taken from each listing's own page.
            for items in inside:
                href = items.get('href')
                cells = []
                if href:
                    browser.get(href)
                    kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
                    details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
                    if details:
                        cells = details[0].find_all('td', {'class': 'value'})

                # The first five value cells are soort, bouwjaar,
                # woonoppervlakte, inhoud and perceel; anything missing is
                # written as 'Unknown' instead of raising.
                values = [cell.get_text(separator='\n', strip=True) for cell in cells[:detail_columns]]
                values += ['Unknown'] * (detail_columns - len(values))
                writer.writerow(values)

            page += 1
jaap_spider(1)
Right now your code doesn't actually seem to make two lists. But assuming that you would make a list of lists l1 out of for huis in info: and a list of lists l2 from for items in inside:, what you could do to combine two lists of lists is: outputlist = [a + b for a, b in zip(l1, l2)].
I incorporated that, plus a conversion to a Pandas DataFrame and an export to csv in the adapted code below:
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
#browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser = webdriver.Chrome(r'C:\Users\NLNIEH\.spyder-py3\chromedriver.exe')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
    """Scrape jaap.nl Amsterdam sale listings and export them to ``output.csv``.

    For each result page from 1 to ``max_pages`` the summary fields of every
    listing (street, address, price, pricetag) and the detail fields read
    from each listing link (soort, bouwjaar, woonoppervlakte, inhoud,
    perceel) are collected. The two sets of rows are paired by position and
    written as a single CSV file. Pages are loaded through the module-level
    Selenium ``browser``.

    Args:
        max_pages: Number of result pages to visit, starting at page 1.
    """
    detail_columns = 5

    # Header rows first. Created once, before the page loop, so that rows
    # from every page accumulate instead of being discarded on each new page.
    outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
    outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]

    page = 1
    while page <= max_pages:
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        browser.get(url)
        time.sleep(5)  # fixed pause after loading the result page

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class': 'property-info'})
        inside = soup.find_all('a', {'class': 'property-inner'})

        # Summary data taken from the result page itself.
        for huis in info:
            # Keep only the first three whitespace-separated tokens of the heading.
            street = huis.find('h2')
            street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:3])

            address = huis.find('div')
            address = address.find('div').text.strip()

            # Reduce the price text to its digits.
            price = huis.find('div', {'class': 'price-info'})
            price = price.find('div').text.strip()
            price = ''.join(re.findall(r'\d', price))

            pricetag = huis.find('div', {'class': 'property-price'})
            pricetag = pricetag.find('span').text.strip()

            outputlist_l1.append([street, address, price, pricetag])

        # Detail data taken from each listing's own page.
        for items in inside:
            href = items.get('href')
            cells = []
            if href:
                browser.get(href)
                kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
                details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
                if details:
                    cells = details[0].find_all('td', {'class': 'value'})

            # Always append exactly one row per link, padded with 'Unknown',
            # so the positional pairing with outputlist_l1 stays aligned.
            values = [cell.get_text(separator='\n', strip=True) for cell in cells[:detail_columns]]
            values += ['Unknown'] * (detail_columns - len(values))
            outputlist_l2.append(values)

        page += 1

    # Merge outputlist_l1 with outputlist_l2, row by row.
    outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]

    # Transform to a pandas DataFrame (first row is the header) and export as csv.
    df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
    df.to_csv('output.csv', index=False)
jaap_spider(1)
You can use csv for writing list in csv file.
import csv
def write_list_in_file(filepath, output):
    """Append ``output`` as one comma-separated row to the file at ``filepath``.

    Args:
        filepath: Path of the CSV file; it is created if it does not exist.
        output: Iterable of values forming the row.
    """
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line can appear between rows on some platforms.
    with open(filepath, 'a', newline='') as outtsv:
        tuple_writer = csv.writer(outtsv, delimiter=',')
        tuple_writer.writerow(output)
Is there a way to count the number of results crawled in BeautifulSoup?
Here is the code.
def crawl_first_url(max_page):
    """Visit listing pages 1..``max_page`` and hand every thumbnail link on.

    Each ``.thumb a`` link found on a page is turned into an absolute URL and
    passed to ``crawl_second_url``.

    Args:
        max_page: Number of the last listing page to visit.
    """
    for page in range(1, max_page + 1):
        url = 'http://www.hdwallpapers.in/page/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        # [href] restricts the match to anchors that actually carry a link,
        # so the concatenation below never receives None.
        for div in soup.select('.thumb a[href]'):
            href = 'http://www.hdwallpapers.in' + div.get('href')
            crawl_second_url(href)
def crawl_second_url(second_href):
    """Placeholder called once for every URL found by ``crawl_first_url``.

    Currently does nothing and returns ``None``.

    Args:
        second_href: Absolute URL of one crawled result.
    """
    # Need to count the number of results here.
    # len(second_href) is not the answer: it is the length of one URL string,
    # not the number of URLs that have been crawled.
crawl_first_url(1)
I want the second function to count the number of crawled results; for example, if 19 URLs have been crawled, I want it to report that number.
Since you only want to count the number of results, I don't see a reason to have a separate function, just add a counter.
page = 1
numResults = 0  # running total of links found across all pages
while page <= max_page:
    url = 'http://www.hdwallpapers.in/page/' + str(page)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for div in soup.select('.thumb a'):
        href = 'http://www.hdwallpapers.in' + div.get('href')
        numResults += 1
    page += 1
# numResults is an int, so it cannot be joined to a str with "+";
# an f-string formats it directly.
print(f"There are {numResults} results.")
This will only count the number of subpages. If you also want to count the top level pages just add another increment line after the soup. You might also want to add a try: except: block to avoid crashes.
I'm wondering how to crawl multiple different pages/cities from one website using beautiful soup/requests without having to repeat my code over and over.
Here is my code right now:
Region = "Marrakech"
Spider = 20
def trade_spider(max_pages):
    """Print a header and deeplink for every activity card found for ``Region``.

    Search-result pages are fetched from the JSON endpoint, the HTML stored
    under the ``activities`` key is parsed, and one line is printed for each
    ``article.activity-card`` element that has both a header and a deeplink.

    Args:
        max_pages: Upper bound used for paging; pages 0 through
            ``max_pages + 1`` are requested.
    """
    from urllib.parse import quote  # local import: only used for the query string

    already_printed = set()
    query = quote(str(Region))  # percent-encode spaces and other unsafe characters

    # Same page numbers as the original "start at -1, increment, then request"
    # loop. NOTE(review): confirm the trailing page max_pages + 1 is intended.
    for page in range(max_pages + 2):
        search_url = "http://www.jsox.com/s/search.json?q=" + query + "&page=" + str(page)
        # The with-block closes the HTTP response once it has been read.
        with urllib.request.urlopen(search_url) as response:
            jsondata = json.loads(response.read().decode("utf-8"))

        activities_html = jsondata['activities']
        g_data = activities_html.strip("'<>()[]\"` ").replace('\'', '\"')
        soup = BeautifulSoup(g_data, "html.parser")

        for item in soup.find_all("article", {"class": "activity-card"}):
            # Reset per card so a card lacking a header or link cannot reuse
            # (or crash on) values left over from a previous card.
            header_final = None
            for header in item.find_all("h3", {"class": "activity-card"}):
                header_final = header.text.strip()
                already_printed.add(header_final)

            deeplink_final = None
            deeplinks = item.find_all("a", {"class": "activity"})
            for link in {anchor.get("href") for anchor in deeplinks}:
                deeplink_final = link
                already_printed.add(deeplink_final)

            if header_final is None or deeplink_final is None:
                continue

            end_final = "Header: " + header_final + " | " + "Deeplink: " + deeplink_final
            print(end_final)
trade_spider(int(Spider))
My goal is to ideally crawl multiple cities/regions from one particular website.
Now, I can do this manually by repeating my code over and over and crawling each individual website and then concatenating my results for each of these dataframes together but that seems very unpythonic. I was wondering if anyone had a faster way or any advice?
I tried to add a second city to my Region variable, but it does not work:
Region = "Marrakech","London"
Can anyone help me with that? Any feedback is appreciated.
Region = ["Marrakech","London"]
Put your while loop inside the for loop, and reset page to -1 at the start of each iteration:
for reg in Region:
    # Restart paging for each region; the while loop tests `page`.
    page = -1
and replace Region with reg when building the request URL.
Region = ["Marrakech","London"]
Spider = 20
def trade_spider(max_pages):
    """Print a header and deeplink for every activity card of every region.

    For each entry of the module-level ``Region`` list, search-result pages
    are fetched from the JSON endpoint, the HTML stored under the
    ``activities`` key is parsed, and one line is printed for each
    ``article.activity-card`` element that has both a header and a deeplink.

    Args:
        max_pages: Upper bound used for paging; pages 0 through
            ``max_pages + 1`` are requested per region.
    """
    from urllib.parse import quote  # local import: only used for the query string

    already_printed = set()

    for reg in Region:
        query = quote(str(reg))  # percent-encode spaces and other unsafe characters

        # Paging restarts for every region. Same page numbers as the original
        # "start at -1, increment, then request" loop.
        # NOTE(review): confirm the trailing page max_pages + 1 is intended.
        for page in range(max_pages + 2):
            search_url = "http://www.jsox.com/s/search.json?q=" + query + "&page=" + str(page)
            # The with-block closes the HTTP response once it has been read.
            with urllib.request.urlopen(search_url) as response:
                jsondata = json.loads(response.read().decode("utf-8"))

            activities_html = jsondata['activities']
            g_data = activities_html.strip("'<>()[]\"` ").replace('\'', '\"')
            soup = BeautifulSoup(g_data, "html.parser")

            for item in soup.find_all("article", {"class": "activity-card"}):
                # Reset per card so a card lacking a header or link cannot
                # reuse (or crash on) values left over from a previous card.
                header_final = None
                for header in item.find_all("h3", {"class": "activity-card"}):
                    header_final = header.text.strip()
                    already_printed.add(header_final)

                deeplink_final = None
                deeplinks = item.find_all("a", {"class": "activity"})
                for link in {anchor.get("href") for anchor in deeplinks}:
                    deeplink_final = link
                    already_printed.add(deeplink_final)

                if header_final is None or deeplink_final is None:
                    continue

                end_final = "Header: " + header_final + " | " + "Deeplink: " + deeplink_final
                print(end_final)
trade_spider(int(Spider))