I have coded a web scraper for Auto Trader, but for some reason when iterating through URLs I can only ever get a maximum length of 1300 for my dataframe. There are 13 results per page, so is there some significance to a limit of 100 pages, or am I just doing something wrong? Any help would be greatly appreciated :)
I've attached my code below.
# Import required libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# List of urls
path = 'https://www.autotrader.co.uk/car-search?advertClassification=standard&postcode=RH104JJ&make=&price-from=500&price-to=100000&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&advertising-location=at_cars&is-quick-search=TRUE&page='
urls = []
for i in range(1, 500):
    url = path + str(i)
    urls.append(url)

# Lists to store the scraped data in
makes = []
prices = []
ratings = []
dates = []
types = []
miles = []
litres = []
bhps = []
transmissions = []
fuels = []
owners = []
attributes = [makes, ratings, dates, types, miles, litres, bhps, transmissions, fuels, owners]

# Iterate through urls
sum = 0
for url in urls:
    sum += 1
    if sum % 10 == 0:
        print(sum)
    # Attempt to connect to the url
    try:
        response = get(url)
    except:
        print('oops')
    html_soup = BeautifulSoup(response.text, 'html.parser')
    # Get a list of individual cars and iterate through it
    car_containers = html_soup.find_all('li', class_='search-page__result')
    for container in car_containers:
        try:
            rating = container.find("div", {"class": "js-tooltip"}).find("div", {"class": "pi-indicator js-tooltip-trigger"}).text.strip()
        except:
            rating = ''
        ratings.append(rating)
        make = container.h2.text.strip().title().split(' ')[0]
        makes.append(make)
        price = container.find("div", {"class": "vehicle-price"}).text[1:]
        prices.append(price)
        specs = container.find("ul", {"class": "listing-key-specs"}).find_all("li", recursive=True)
        for spec in specs:
            if spec.text.split(' ')[0].isdigit() and len(spec.text.split(' ')[0]) == 4:
                date = spec.text.split(' ')[0]
                dates.append(date)
            if 'mile' in str(spec):
                mile = spec.text.split(' ')[0]
                miles.append(mile)
            if 'l' in str(spec).lower() and str(spec.text)[:-1].replace('.', '').isnumeric() and not spec.text.split(' ')[0].isdigit():
                litre = spec.text[:-1]
                litres.append(litre)
            if any(x in str(spec).lower() for x in ['automatic', 'manual']):
                transmission = spec.text
                transmissions.append(transmission)
            if any(x in str(spec).lower() for x in ['bhp', 'ps']):
                bhp = spec.text
                bhps.append(bhp)
            if any(x in str(spec).lower() for x in ['petrol', 'diesel']):
                fuel = spec.text
                fuels.append(fuel)
            if 'owner' in str(spec):
                owner = spec.text
                owners.append(owner.split(' ')[0])
            typelist = ['hatchback', 'saloon', 'convertible', 'coupe', 'suv', 'mpv', 'estate', 'limousine', 'pickup']
            if any(x in str(spec).lower() for x in typelist):
                typ = spec.text
                types.append(typ)
        # Filling in empty spaces
        for attribute in attributes:
            if len(attribute) < len(prices):
                attribute.append('')

# Creating a dataframe from the lists
df = pd.DataFrame({'makes': makes,
                   'Price': prices,
                   'Rating': ratings,
                   'Year': dates,
                   'Type': types,
                   'Miles': miles,
                   'Litres': litres,
                   'BHP': bhps,
                   'Transmission': transmissions,
                   'Fuel': fuels,
                   'Owners': owners})
Maybe just use a URL shortener if the length of the URL is the problem.
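If the site simply stops serving new results after a certain number of pages, one way to avoid guessing the page count is to stop as soon as a page comes back empty. A minimal sketch, assuming the same search-URL prefix and the same 'search-page__result' markup as in the question:
from requests import get
from bs4 import BeautifulSoup

# 'path' is the same search-URL prefix defined in the question above
path = 'https://www.autotrader.co.uk/car-search?advertClassification=standard&postcode=RH104JJ&make=&price-from=500&price-to=100000&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&advertising-location=at_cars&is-quick-search=TRUE&page='

page = 0
while True:
    page += 1
    response = get(path + str(page))
    html_soup = BeautifulSoup(response.text, 'html.parser')
    car_containers = html_soup.find_all('li', class_='search-page__result')
    if not car_containers:  # an empty page means there are no more results
        print(f'No results on page {page} - stopping')
        break
    # ... parse car_containers exactly as in the code above ...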
Related
I'm pretty new to web scraping but enjoying it so far, so I thought I'd test myself!
I've written this query to scrape this website, but I'm wondering whether there is a way of making it more efficient. At the moment, I've had to set the max page to 87, as this is the last page that guitars appear on. However, amps only have 15 pages of results, yet I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []
n = 88

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    for x in range(1, n):
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        for index in range(0, len(products)):
            guitar_products.append({
                'product': products[index],
                'price': prices[index],
                'avail': avails[index]
            })

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Thanks
Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    page_number = 1
    while True:
        url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
        print(url)
        page_number += 1
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        for div_product in soup.find_all('div', class_="product-inner"):
            product = div_product.find('h3', {'class': 'qa-product-list-item-title'}).get_text(strip=True)
            price = div_product.find('span', {'class': 'js-pounds'}).get_text(strip=True)
            avail = div_product.find('div', {'class': 'availability'}).get_text(strip=True)
            guitar_products.append({'product': product, 'price': price, 'avail': avail})
        # Is there a next button?
        if not soup.find('a', class_="next-page-button"):
            print("No more")
            break

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Improvements:
It looks for the Next button on each page and moves on to the next category when there isn't one (a stripped-down sketch of this pagination check follows below).
It locates the <div> holding each product and then uses a single find for each product detail. This avoids building several parallel lists and joining them afterwards.
It builds the URL using a Python f-string.
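To show the pagination idea on its own, here is a stripped-down sketch, assuming the same 'next-page-button' class used in the code above and a single category for brevity:
import requests
from bs4 import BeautifulSoup

page_number = 1
while True:
    url = f"https://www.guitarguitar.co.uk/guitars/electric/page-{page_number}"
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    # ... collect the product data for this page here ...
    if not soup.find('a', class_="next-page-button"):
        break  # no Next button: this was the last page of the category
    page_number += 1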
You can check H1:
soup = BeautifulSoup(page.content, 'html.parser')
if soup.find('h1').contents[0] == 'Page Not Found':
    break
or change the loop from for to while:
is_page = True
x = 0
while is_page:
    x = x + 1
    . . .
    if soup.find('h1').contents[0] == 'Page Not Found':
        is_page = False
        break
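Spelled out a little more, a sketch only, assuming the same requests/BeautifulSoup setup and a single category for brevity:
import requests
from bs4 import BeautifulSoup

is_page = True
x = 0
while is_page:
    x = x + 1
    url = f"https://www.guitarguitar.co.uk/guitars/electric/page-{x}"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    h1 = soup.find('h1')
    # a 'Page Not Found' heading signals we have gone past the last page
    if h1 and h1.contents[0] == 'Page Not Found':
        is_page = False
        break
    # ... scrape the products on this page ...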
This is probably not the most elegant solution, but it is functional and straightforward: an infinite loop that ends when no products are found.
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []
n = 1

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    while True:
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        for index in range(0, len(products)):
            guitar_products.append({
                'product': products[index],
                'price': prices[index],
                'avail': avails[index]
            })
        if len(products) == 0:
            n = 1
            break
        else:
            n += 1

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
I am trying to fetch data from 7000 URLs and save the scraped info into CSVs. Rather than go through all 7000 URLs in one go, how can I break the output into, say, 1000 URLs per CSV?
Below is an example of my current code. For the example I have scaled it down: the total of 7000 URLs becomes 10, and each CSV covers 2 URLs.
import os
import time
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup

# 'path' (the output directory) is assumed to be defined elsewhere
urls = ['www.1.com', 'www.2.com', 'www.3.com', 'www.4.com', 'www.5.com', 'www.6.com', 'www.7.com', 'www.8.com', 'www.9.com', 'www.10.com']
ranks = []
names = []
prices = []
count = 0
rows_count = 0
total_index = 10
i = 1

while i < total_index:
    for url in urls[rows_count+0:rows_count+2]:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        count += 1
        print('Loop', count, f'started for {url}')
        rank = []
        name = []
        price = []
        # loop for watchlist
        for item in soup.find('div', class_='sc-16r8icm-0 bILTHz'):
            item = item.text
            rank.append(item)
        ranks.append(rank)
        # loop for ticker name
        for ticker in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1'):
            ticker = ticker.text
            name.append(ticker)
        names.append(name)
        # loop for price
        for price_tag in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle'):
            price_tag = price_tag.text
            price.append(price_tag)
        prices.append(price)
        sleep_interval = randint(1, 2)
        print('Sleep interval ', sleep_interval)
        time.sleep(sleep_interval)
    rows_count += 2
    df = pd.DataFrame(ranks)
    df2 = pd.DataFrame(names)
    df3 = pd.DataFrame(prices)
    final_table = pd.concat([df, df2, df3], axis=1)
    final_table.columns = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    final_table.to_csv(os.path.join(path, fr'summary_{rows_count}.csv'))
    i += 2
Seeking some senior assistance with my problem.
Or is there another way to do it?
As I understand it you are getting one row of data from scraping each URL. A generic solution for scraping in chunks and writing to CSVs would look something like this:
def scrape_in_chunks(urls, scrape, chunk_size, filename_template):
    """ Apply a scraping function to a list of URLs and save as a series of CSVs with data from
    one URL on each row and chunk_size urls in each CSV file.
    """
    for i in range(0, len(urls), chunk_size):
        df = pd.DataFrame([scrape(url) for url in urls[i:i+chunk_size]])
        df.to_csv(filename_template.format(start=i, end=i+chunk_size-1))

def my_scraper(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    print(f'Started scraping {url}')
    keys = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    data = ([item.text for item in soup.find('div', class_='sc-16r8icm-0 bILTHz')] +
            [item.text for item in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1')] +
            [item.text for item in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle')])
    return dict(zip(keys, data))  # You could alternatively return a dataframe or series here but dict seems simpler

scrape_in_chunks(urls, my_scraper, 1000, os.path.join(path, "summary {start}-{end}.csv"))
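If you want to keep the random delay from your original loop, a small sketch of one way to add it back, using a hypothetical wrapper function and the same randint/time modules you already use:
import time
from random import randint

def my_scraper_with_delay(url):
    """Hypothetical wrapper: the same scraper as above, plus the polite 1-2 second pause
    from the original code."""
    row = my_scraper(url)
    time.sleep(randint(1, 2))
    return row

# scrape_in_chunks(urls, my_scraper_with_delay, 1000, os.path.join(path, "summary {start}-{end}.csv"))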
I am trying to scrape multiple pages from multiple URLs efficiently. I have been able to scrape multiple pages from one URL successfully, but I have been unable to implement this for multiple URLs. Any help would be greatly appreciated. Thank you.
Current Loop Code:
BASE = 'https://www.unegui.mn'
URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
           'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']

with requests.Session() as session:
    while True:
        (r := session.get(f'{URL}{page+1}')).raise_for_status()
        m = re.search(r'.*page=(\d+)$', r.url)
        if m and int(m.group(1)) == page:
            break
        page += 1
        print(f'Scraping page {page}')
Desired URL Loop:
The only thing that changes for each URL is the 1-r, 2-r, 3-r section. The total number of URLs is 5.
URL = [f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=',
f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=',
f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=',
f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=',
f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
]
Full Code:
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import pandas as pd
import re
import csv

today = datetime.today().strftime('%y%m%d ')

def main():
    page = 0
    name = []
    date = []
    address = []
    district = []
    city = []
    price = []
    area_sqm = []
    rooms = []
    floor = []
    commission_year = []
    building_floors = []
    garage = []
    balcony = []
    windows = []
    window_type = []
    floor_type = []
    door_type = []
    leasing = []
    description = []
    link = []

    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
    COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
               'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']

    with requests.Session() as session:
        while True:
            (r := session.get(f'{URL}{page+1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break
            page += 1
            print(f'Scraping page {page}')
            soup = BS(r.text, 'lxml')
            for tag in soup.findAll('div', class_='list-announcement-block'):
                _name = tag.find('a', attrs={'itemprop': 'name'})
                name.append(_name.get('content', 'N/A'))
                if (_link := _name.get('href', None)):
                    link.append(f'{BASE}{_link}')
                    (_r := session.get(link[-1])).raise_for_status()
                    _spanlist = BS(_r.text, 'lxml').find_all('span', class_='value-chars')
                    floor_type.append(_spanlist[0].get_text().strip())
                    balcony.append(_spanlist[1].get_text().strip())
                    garage.append(_spanlist[2].get_text().strip())
                    window_type.append(_spanlist[3].get_text().strip())
                    door_type.append(_spanlist[4].get_text().strip())
                    windows.append(_spanlist[5].get_text().strip())
                    _alist = BS(_r.text, 'lxml').find_all('a', class_='value-chars')
                    commission_year.append(_alist[0].get_text().strip())
                    building_floors.append(_alist[1].get_text().strip())
                    area_sqm.append(_alist[2].get_text().strip())
                    floor.append(_alist[3].get_text().strip())
                    leasing.append(_alist[4].get_text().strip())
                    district.append(_alist[5].get_text().strip())
                    address.append(_alist[6].get_text().strip())
                rooms.append(tag.find('div', attrs={'announcement-block__breadcrumbs'}).get_text().split('»')[1].strip())
                description.append(tag.find('div', class_='announcement-block__description').get_text().strip())
                date.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[0].strip())
                city.append((tag.find('meta', attrs={'itemprop': 'areaServed'})).get('content'))
                if (_price := tag.find('meta', attrs={'itemprop': 'price'})) is None:
                    _price = tag.find('div', class_='announcement-block__price _premium')
                price.append(_price.get_text().strip() if _price else 'N/A')

    df = pd.DataFrame(zip(name, date, address, district, city,
                          price, area_sqm, rooms, floor, commission_year,
                          building_floors, garage, balcony, windows, window_type,
                          floor_type, door_type, leasing, description, link), columns=COLUMNS)
    return df

if __name__ == '__main__':
    df = main()
    df.to_csv(f'{today}HPD.csv', encoding='cp1251', errors='ignore', index=False)
You can combine for loops with Python's range() function.
The range() function provides a sequence of integers based upon the function's arguments.
range(start, stop[, step])
The start argument is the first value in the range. If range() is called with only one argument, then Python assumes start = 0.
The stop argument is the upper bound of the range. It is important to realize that this upper value is not included in the range.
Example:
for i in range(1, 6):
    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
    print(URL)
Output:
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=
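Putting that together with the paging loop from your full code, a rough sketch of the structure only; the per-page scraping stays exactly as you have it in main():
import re
import requests

BASE = 'https://www.unegui.mn'

with requests.Session() as session:
    for i in range(1, 6):  # 1-r through 5-r
        URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
        page = 0
        while True:
            (r := session.get(f'{URL}{page+1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:  # redirected back: no more pages
                break
            page += 1
            print(f'Scraping {i}-r page {page}')
            # ... parse the page and append to the lists as in main() ...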
I am having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the Place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class': 'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result out of expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class': 'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
As you are adding places to the final df, this line (currently sitting inside the for pagenumber in ... / for car in ... loops)
places = []
should move all the way up, out of the main for loop, to here:
frames = []
places = []
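Concretely, the top of the script would then look like this, a sketch using the same scraping code as in the question, with only the places list moved outside the loops:
import requests
import bs4
import pandas as pd

frames = []
places = []  # initialised once, so it collects one place per car across all pages

for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            tmp.append([car_item.find('span', {'class': 'key'}).text,
                        car_item.find('span', {'class': 'value'}).text])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            places.append(car_item.find('h3', {'class': 'heading'}).text.strip())
        frames.append(pd.DataFrame(tmp).set_index(0))
# ... build df_final from frames and places exactly as before ...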
I have the following code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests import get

date = []
tourney_round = []
result = []
winner_odds = []
loser_odds = []
surface = []
players_and_tourney = []

response = get('http://www.tennisexplorer.com/player/humbert-e2553/?annual=all')
page_html = BeautifulSoup(response.text, 'html.parser')

results2018_containers = page_html.find_all('div', id='matches-2018-1-data')
for container in results2018_containers:
    played_date_2018 = results2018_containers[0].findAll('td', class_='first time')
    for i in played_date_2018:
        date.append(i.text)
string_2018 = '2018'
date = [x + string_2018 for x in date]

for container in results2018_containers:
    rounds_2018 = results2018_containers[0].findAll('td', class_='round')
    for i in rounds_2018:
        tourney_round.append(i.text)

for container in results2018_containers:
    results_2018 = results2018_containers[0].findAll('td', class_='tl')
    for i in results_2018:
        result.append(i.text)

for container in results2018_containers:
    surfaces_2018 = results2018_containers[0].findAll('td', class_='s-color')
    for i in surfaces_2018:
        surface.append(i.find('span')['title'])

for container in results2018_containers:
    odds_2018 = results2018_containers[0].findAll('td', class_='course')
    winner_odds_2018 = odds_2018[0:][::2]
    for i in winner_odds_2018:
        winner_odds.append(i.text)
    loser_odds_2018 = odds_2018[1:][::2]
    for i in loser_odds_2018:
        loser_odds.append(i.text)

for container in results2018_containers:
    namesandtourney_2018 = results2018_containers[0].findAll('td', class_='t-name')
    for i in namesandtourney_2018:
        players_and_tourney.append(i.text)

from itertools import chain, groupby, repeat
chainer = chain.from_iterable

def condition(x):
    return x.startswith('\xa0')

elements = [list(j) for i, j in groupby(players_and_tourney, key=condition) if not i]
# create list of headers
headers = [next(j) for i, j in groupby(players_and_tourney, key=condition) if i]

# chain list of lists, and use repeat for headers
initial_df_2018 = pd.DataFrame({'Date': date,
                                'Surface': surface,
                                'Players': list(chainer(elements)),
                                'Tournament': list(chainer(repeat(i, j) for i, j in
                                                           zip(headers, map(len, elements)))),
                                'Round': tourney_round,
                                'Result': result,
                                'Winner Odds': winner_odds,
                                'Loser Odds': loser_odds})

initial_df_2018['Winner'], initial_df_2018['Loser'] = initial_df_2018['Players'].str.split(' - ', 1).str
del initial_df_2018['Players']
initial_df_2018 = initial_df_2018[['Date', 'Surface', 'Tournament', 'Winner', 'Loser', 'Result', 'Winner Odds', 'Loser Odds']]
I want to create a loop that runs the code for every year starting from 2005. So basically, running the loop with 2018 replaced throughout the code by each year between 2005 and 2018. If possible, the code would run first for the year 2018, then 2017, and so on down to 2005.
Edit: I added the code that I used to pull data for the year 2018, but I want a loop that will pull data for all the years that can be found on the page.
If I understood you correctly, you want to repeat what you did for 2018 for all years between 2005 and 2018.
What I did was loop over your code for the years in that range, replacing the id each time and collecting all the data in a dictionary keyed by year.
response = get('http://www.example.com')
page_html = BeautifulSoup(response.text, 'html.parser')
date_dict = {}

for year in range(2019, 1, -1):
    date = []
    string_id = "played-{}-data".format(year)
    results_containers = page_html.find_all('div', id=string_id)
    if not results_containers:  # no table for this year on the page
        continue
    for container in results_containers:
        played_date = results_containers[0].findAll('td', class_='plays')
        for i in played_date:
            date.append(i.text)
    if not (year in date_dict):
        date_dict[year] = []
    date_dict[year] += date
You can store the year as an integer but still use it in a string.
for year in range(2018, 2004, -1):
    print(f"Happy New Year {year}")
Other ways to include a number in a string are "Happy New Year {}".format(year) or "it is now " + str(year) + " more text".
Also, I don't think you do, but if someone finds this and really wants to "iterate a string", Caesar ciphers are a good place to look.
There's no problem looping that, but you need to define how you want your results. I used a dictionary here, and I've turned your code into a function that I can call with variables:
def get_data(year):
    date = []
    response = get('http://www.example.com')
    page_html = BeautifulSoup(response.text, 'html.parser')
    results_containers = page_html.find_all('div', id='played-{}-data'.format(year))
    for container in results_containers:
        played_date = results_containers[0].findAll('td', class_='plays')
        for i in played_date:
            date.append(i.text)
    return date
Now all I have to do is create a range of possible years and call the function for each one, which can be done as simply as:
all_data = {year: get_data(year) for year in range(2018, 2004, -1)}
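If you then want the per-year results in one flat table, a small follow-on sketch, assuming pandas and the all_data dict above:
import pandas as pd

# one row per scraped date, tagged with the year it came from
rows = [(year, d) for year, dates in all_data.items() for d in dates]
dates_by_year = pd.DataFrame(rows, columns=['Year', 'Date'])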
Just use a for loop over a range. Something like:
date = []
response = get('http://www.example.com')
page_html = BeautifulSoup(response.text, 'html.parser')

for year in range(2018, 2004, -1):
    year_id = 'played-{}-data'.format(year)
    results_containers = page_html.find_all('div', id=year_id)
    ...