Searching Dataframe for Specific Values to Be Stored - python

I'm new to programming and Python. I'm adapting this code (https://github.com/rileypredum/East-Bay-Housing-Web-Scrape/blob/master/EB_Room_Prices.ipynb) to scrape Craigslist. My goal is to retrieve and store all the automotive posts in Chicago. I am able to store the post title, post time, price, and neighborhood. My next goal is to create a new column holding only the make of the vehicle, i.e. Toyota, Nissan, Honda, etc., by searching the post title. How do I do this?
I believe cell In [13] below is where I would add the logic for a "post_make" variable that searches "post_title".
#build out the loop
from time import sleep
from random import randint
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

#find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text)
pages = np.arange(0, results_total, 120)

iterations = 0
post_timing = []
post_hoods = []
post_title_texts = []
post_links = []
post_prices = []

for page in pages:
    #get request
    response = get("https://sfbay.craigslist.org/search/eby/roo?"
                   + "s="
                   + str(page)
                   + "&hasPic=1"
                   + "&availabilityMode=0")

    sleep(randint(1,5))

    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))

    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')

    #define the posts
    posts = html_soup.find_all('li', class_= 'result-row')

    #extract data item-wise
    for post in posts:
        if post.find('span', class_ = 'result-hood') is not None:
            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)

            post_price = post.a.text
            post_prices.append(post_price)

    iterations += 1
    print("Finished iteration: " + str(iterations))
I'm still trying to figure out the best way to show the output. The current output in Excel is:
posted, neighborhood, post title, url, price
My goal is to add "post make" after the price.
I'm also looking for advice on how to show output from Jupyter notebooks here.

It's rather tricky to pull that out. I gave it a shot using another package, spaCy, to try to pull out the entities that are linked to organisations/car companies. It's not perfect, but it's a start:
Code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy

nlp = spacy.load("en_core_web_sm")

req_url = 'https://chicago.craigslist.org/search/cta'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36'}
payload = {
    's': '0',
    'query': 'automotive',
    'sort': 'rel'}

response = requests.get(req_url, headers=headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')

total_posts = int(soup.find('span',{'class':'totalcount'}).text)
pages = list(range(0, total_posts, 120))

iterations = 0
post_timing = []
post_hoods = []
post_title_texts = []
post_links = []
post_prices = []
post_makes = []
post_models = []

for page in pages:
    payload = {
        's': page,
        'query': 'automotive',
        'sort': 'rel'}

    response = requests.get(req_url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    posts = soup.find_all('li', class_= 'result-row')

    #extract data item-wise
    for post in posts:
        if post.find('span', class_ = 'result-hood') is not None:
            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)

            post_price = post.a.text.strip()
            post_prices.append(post_price)

            try:
                # Used Spacy and Named Entity Recognition (NER) to pull out makes/models within the title text
                post_title_text = post_title_text.replace('*', ' ')
                post_title_text = [ each.strip() for each in post_title_text.split(' ') if each.strip() != '' ]
                post_title_text = ' '.join(post_title_text)

                doc = nlp(post_title_text)
                model = [ent.text for ent in doc.ents if ent.label_ == 'PRODUCT']

                make_model_list = [ent.text for ent in doc if ent.tag_ == 'NNP']
                doc = nlp(' '.join(make_model_list))
                make = [ent.text for ent in doc.ents if ent.label_ == 'ORG']

                post_make = make[0]
                post_makes.append(post_make)

                post_model = model[0]
                post_models.append(post_model)
            except:
                post_makes.append('')
                post_models.append('')

    iterations += 1
    print("Finished iteration: " + str(iterations))

data = list(zip(post_timing,post_hoods,post_title_texts,post_links,post_prices,post_makes,post_models))

df = pd.DataFrame(list(zip(post_timing,post_hoods,post_title_texts,post_links,post_prices,post_makes,post_models)),
                  columns = ['time','hood','title','link','price','make','model'])
Output:
print (df.head(20).to_string())
time hood title link price make model
0 2019-10-03 07:12 (TEXT 855-976-4304 FOR CUSTOM PAYMENT) 2015 Ford Focus SE Sedan 4D sedan Dk. Gray - F... https://chicago.craigslist.org/chc/ctd/d/chica... $11500 Ford Focus SE
1 2019-10-03 06:03 (EVERYBODY DRIVES IN SOUTH ELGIN) $174/mo [][][] 2013 Hyundai Sonata BAD CREDIT OK https://chicago.craigslist.org/nwc/ctd/d/south... $174 Sonata BAD
2 2019-10-03 00:04 (EVERYBODY DRIVES IN SOUTH ELGIN) $658/mo [][][] 2016 Jeep Grand Cherokee BAD CR... https://chicago.craigslist.org/nwc/ctd/d/south... $658 Hyundai
3 2019-10-02 21:04 (EVERYBODY DRIVES IN SOUTH ELGIN) $203/mo [][][] 2010 Chevrolet Traverse BAD CRE... https://chicago.craigslist.org/nwc/ctd/d/south... $203 Jeep Grand Cherokee BAD Traverse BAD
4 2019-10-02 20:24 (DENVER) 2017 Jeep Cherokee Latitude 4x4 4dr SUV SKU:60... https://chicago.craigslist.org/chc/ctd/d/denve... $8995 Cherokee
5 2019-10-02 20:03 ( Buy Here Pay Here!) Good Credit, Bad Credit, NO Credit = NO Problem https://chicago.craigslist.org/nwc/ctd/d/chica... $0 Chevrolet
6 2019-10-02 20:03 ( Buy Here Pay Here!) Aceptamos Matricula!!! Te pagan en efectivo?? ... https://chicago.craigslist.org/wcl/ctd/d/chica... $0 Jeep
7 2019-10-02 20:02 ( Buy Here Pay Here!) Good Credit, Bad Credit, No Credit = No Problem https://chicago.craigslist.org/chc/ctd/d/vista... $0 Credit Bad Credit
8 2019-10-02 20:00 ( Buy Here Pay Here!) Good Credit, Bad Credit, No Credit= No Problem https://chicago.craigslist.org/sox/ctd/d/chica... $0
9 2019-10-02 19:15 (* CHRYSLER * TOWN AND COUNTRY * WWW.YOURCHOI... 2013*CHRYSLER*TOWN & COUNTRY*TOURING LEATHER K... https://chicago.craigslist.org/nwc/ctd/d/2013c... $9499
10 2019-10-02 19:09 (*CADILLAC* *DTS* WWW.YOURCHOICEAUTOS.COM) 2008*CADILLAC*DTS*1OWNER LEATHER SUNROOF NAVI ... https://chicago.craigslist.org/sox/ctd/d/2008c... $5999 Credit Bad Credit
11 2019-10-02 18:59 (WAUKEGANAUTOAUCTION.COM OPEN TO PUBLIC OVER ... 2001 *GMC**YUKON* XL DENALI AWD 6.0L V8 1OWNER... https://chicago.craigslist.org/nch/ctd/d/2001-... $1200
12 2019-10-02 18:47 (*GMC *SAVANA *CARGO* WWW.YOURCHOICEAUTOS.COM) 1999 *GMC *SAVANA *CARGO*G2500 SHELVES CABINET... https://chicago.craigslist.org/sox/ctd/d/1999-... $2999 Credit Bad Credit
13 2019-10-02 18:04 ( Buy Here Pay Here!) GoodCredit, Bad Credit, No credit = No Problem https://chicago.craigslist.org/nwc/ctd/d/chica... $0
14 2019-10-02 18:05 ( Buy Here Pay Here!) Rebuild your credit today!!! https://chicago.craigslist.org/sox/ctd/d/chica... $0 CHRYSLER
15 2019-10-02 18:03 ( Buy Here Pay Here!) Rebuild your credit today!!! Repo? No Problem!... https://chicago.craigslist.org/chc/ctd/d/vista... $0
16 2019-10-02 17:59 (* ACURA * TL * WWW.YOURCHOICEAUTOS.COM) 2006 *ACURA**TL* LEATHER SUNROOF CD KEYLES ALL... https://chicago.craigslist.org/sox/ctd/d/2006-... $4499
17 2019-10-02 18:00 ( Buy Here Pay Here!) Buy Here Pay Here!!! We Make it Happen!! Bad C... https://chicago.craigslist.org/wcl/ctd/d/chica... $0
18 2019-10-02 17:35 (ST JOHN) 2009 NISSAN VERSA https://chicago.craigslist.org/nwi/ctd/d/saint... $4995
19 2019-10-02 17:33 (DENVER) 2013 Scion tC Base 2dr Coupe 6M SKU:065744 Sci... https://chicago.craigslist.org/chc/ctd/d/denve... $5995 GoodCredit Bad Credit
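If spaCy feels heavy for this, a simpler option is a plain keyword match against a hand-maintained list of makes. Here is a minimal sketch (my own addition, assuming the df built above):
# Hand-maintained list of makes to look for in the title (extend as needed)
KNOWN_MAKES = ['toyota', 'nissan', 'honda', 'ford', 'chevrolet', 'hyundai',
               'jeep', 'gmc', 'cadillac', 'chrysler', 'acura', 'bmw', 'scion']

def find_make(title):
    """Return the first known make found in the post title, else an empty string."""
    title_lower = title.lower()
    for make in KNOWN_MAKES:
        if make in title_lower:
            return make.capitalize()
    return ''

# Add the new column by searching each post title
df['make_keyword'] = df['title'].apply(find_make)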

Related

How can you retrieve webpages based on URLs and convert each to a beautifulsoup object

So I am scraping a website. I was able to get all the information thanks to Andrej Kesely, and I was also able to synthesize the URLs that download the first 50 pages. Now I want to retrieve the webpages based on those URLs, convert each into a BeautifulSoup object, and retrieve all the information plus the URL (href) to access the detailed car information.
I am new to Python and website scraping, so I really don't know where to start, but here is the code that synthesizes the first 50 pages of the website:
from bs4 import BeautifulSoup
import requests
import os

for i in range(1, 50):
    response = requests.get(f"https://jammer.ie/used-cars?page={i}&per-page=12")
    with open(f"example{i}.html", "w", encoding="utf-8") as fp:
        fp.write(response.text)

    urls = []
    prices = []
    makes = []

    # for loop index by i
    with open(f"example{i}.html", "r") as fp:
        webpage = fp.read()

    soup = BeautifulSoup(webpage, "html.parser")
    tables = soup.find_all('div', {"class": "span-9 right-col"})
    len(tables[0].contents)

    for it in tables[0].contents[1:]:
        if it == "\n":
            continue
        for jt in it.findall('div', class_="col-lg-4 col-md-12 car-listing"):
            price = jt.find('p', class_="price").text
            make = jt.find('h6', class_="car-make").text
            url = f"https://jammer.ie/used-cars?page={i}&per-page=12"
            urls.append(url)

prices
I know I must make a BeautifulSoup object, but I really don't know what to do next; if you could explain what to do, it would be great, thanks.
I want to be able to:
Retrieve the webpages based on these URLs and convert each into a BeautifulSoup object, and
Retrieve car manufacturing year, engine, price, dealer information (if it is available), and the URL (href) to access the detailed car information.
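As a minimal sketch of the first step (fetching each synthesized URL and parsing the response into a BeautifulSoup object), assuming the same URL pattern as above, something like this could work:
import requests
from bs4 import BeautifulSoup

# Build the same paginated URLs as in the question (assumption: first 49 pages)
page_urls = [f"https://jammer.ie/used-cars?page={i}&per-page=12" for i in range(1, 50)]

soups = []
for url in page_urls:
    response = requests.get(url)  # fetch the page
    soup = BeautifulSoup(response.text, "html.parser")  # parse it into a soup object
    soups.append(soup)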
To iterate over multiple pages you can do:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://jammer.ie/used-cars?page={}&per-page=12"

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    soup = BeautifulSoup(requests.get(url.format(page)).text, "html.parser")

    for car in soup.select(".car"):
        info = car.select_one(".top-info").get_text(strip=True, separator="|")
        make, model, year, price = info.split("|")
        dealer_name = car.select_one(".dealer-name h6").get_text(
            strip=True, separator=" "
        )
        address = car.select_one(".address").get_text(strip=True)

        features = {}
        for feature in car.select(".car--features li"):
            k = feature.img["src"].split("/")[-1].split(".")[0]
            v = feature.span.text
            features[f"feature_{k}"] = v

        all_data.append(
            {
                "make": make,
                "model": model,
                "year": year,
                "price": price,
                "dealer_name": dealer_name,
                "address": address,
                "url": "https://jammer.ie"
                + car.select_one("a[href*=vehicle]")["href"],
                **features,
            }
        )

df = pd.DataFrame(all_data)

# prints sample data to screen:
print(df.tail().to_markdown(index=False))

# saves all data to CSV
df.to_csv("data.csv", index=False)
Prints:
| make | model | year | price | dealer_name | address | url | feature_speed | feature_engine | feature_transmission | feature_owner | feature_door-icon1 | feature_petrol5 | feature_paint | feature_hatchback |
|------|-------|------|-------|-------------|---------|-----|---------------|----------------|----------------------|---------------|--------------------|-----------------|---------------|-------------------|
| Skoda | Fabia | 2014 | €7,500 | Blue Diamond Cars | Co. Cork | https://jammer.ie/vehicle/165691-skoda-fabia-2014 | 128627 miles | 1.2 litres | Manual | 2 previous owners | 4 doors | Petrol | Beige | Estate |
| Ford | Kuga | 2016 | €16,750 | Ballincollig Motor Company / Trident | Co. Cork | https://jammer.ie/vehicle/165690-ford-kuga-2016 | 99000 miles | 2.0 litres | Manual | 1 previous owners | 5 doors | Diesel | Grey | MPV |
| Hyundai | i40 | 2015 | Price on application | Ballincollig Motor Company / Trident | Co. Cork | https://jammer.ie/vehicle/165689-hyundai-i40-2015 | 98000 miles | 1.7 litres | Manual | 1 previous owners | 5 doors | Diesel | Black | Estate |
| Dacia | Sandero | 2016 | €9,950 | Ballincollig Motor Company / Trident | Co. Cork | https://jammer.ie/vehicle/165688-dacia-sandero-2016 | 43000 miles | nan | Manual | 3 previous owners | 4 doors | Petrol | Blue | Hatchback |
| Ford | Fiesta | 2016 | Price on application | Ballincollig Motor Company / Trident | Co. Cork | https://jammer.ie/vehicle/165687-ford-fiesta-2016 | 45000 miles | 1.0 litres | Manual | 2 previous owners | 5 doors | Petrol | Silver | Hatchback |
and saves all of the data to data.csv.

Beautifulsoup taking too much time to execute in the code

I am trying to scrape this website:
https://media.info/newspapers/titles
This website has a list of newspapers from A to Z. I first have to scrape all the URLs and then scrape some more information from each newspaper.
Below is my code to scrape the URLs of all the newspapers, starting from A to Z:
driver.get('https://media.info/newspapers/titles')
time.sleep(2)

page_title = []
pages = driver.find_elements(By.XPATH, "//div[@class='pages']//a")
for i in pages:
    page_title.append(i.get_attribute("href"))

names = []
for i in page_title:
    driver.get(i)
    time.sleep(1)
    name = driver.find_elements(By.XPATH, "//div[@class='info thumbBlock']//a")
    for i in name:
        names.append(i.get_attribute("href"))
len(names)  # -> 1688
names[0:5]
['https://media.info/newspapers/titles/abergavenny-chronicle',
'https://media.info/newspapers/titles/abergavenny-free-press',
'https://media.info/newspapers/titles/abergavenny-gazette-diary',
'https://media.info/newspapers/titles/the-abingdon-herald',
'https://media.info/newspapers/titles/academies-week']
Moving further, I need to scrape some information like owner, postal address, email, etc., and I wrote the below code.
test = []
c = 0
for i in names:
    driver.get(i)
    time.sleep(2)
    r = requests.get(i)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        name = driver.find_element(By.XPATH, "//*[@id='mainpage']/article/div[3]/h1").text
        try:
            twitter = driver.find_element(By.XPATH, "//*[@id='mainpage']/article/table[3]/tbody/tr/td[1]/a").text
        except:
            twitter = None
        try:
            twitter_followers = driver.find_element(By.XPATH, "//*[@id='mainpage']/article/table[3]/tbody/tr/td[1]/small").text.replace(' followers', '').lstrip('(').rstrip(')')
        except:
            twitter_followers = None
        people = []
        try:
            persons = driver.find_elements(By.XPATH, "//div[@class='columns']")
            for i in persons:
                people.append(i.text)
        except:
            people.append(None)
        try:
            owner = soup.select_one('th:contains("Owner") + td').text
        except:
            owner = None
        try:
            postal_address = soup.select_one('th:contains("Postal address") + td').text
        except:
            postal_address = None
        try:
            Telephone = soup.select_one('th:contains("Telephone") + td').text
        except:
            Telephone = None
        try:
            company_website = soup.select_one('th:contains("Official website") + td > a').get('href')
        except:
            company_website = None
        try:
            main_email = soup.select_one('th:contains("Main email") + td').text
        except:
            main_email = None
        try:
            personal_email = soup.select_one('th:contains("Personal email") + td').text
        except:
            personal_email = None
        r2 = requests.get(company_website)
        soup2 = BeautifulSoup(r2.content, 'lxml')
        try:
            is_wordpress = soup2.find("meta", {"name": "generator"}).get('content')
        except:
            is_wordpress = None
        news_Data = {
            "Name": name,
            "Owner": owner,
            "Postal Address": postal_address,
            "main Email": main_email,
            "Telephone": Telephone,
            "Personal Email": personal_email,
            "Company Wesbite": company_website,
            "Twitter_Handle": twitter,
            "Twitter_Followers": twitter_followers,
            "People": people,
            "Is Wordpress?": is_wordpress
        }
        test.append(news_Data)
        c = c + 1
        print("completed", c)
    except Exception as Argument:
        print(f"There is an exception with {i}")
        pass
I am using both Selenium and BeautifulSoup with requests to scrape the data. The code fulfils the requirements.
Firstly, is it good practice to use Selenium and BeautifulSoup together in the same code like this?
Secondly, the code is taking too much time. Is there any alternative way to reduce the runtime of the code?
BeautifulSoup is not slow: making requests and waiting for responses is what is slow.
You do not necessarily need a selenium/chromedriver setup for this task; it's doable with requests (or another Python HTTP library).
Yes, there are ways to speed it up; however, keep in mind you are making requests to a server, which might become overwhelmed if you send too many requests at once, or which might block you.
Here is an example without Selenium, which will accomplish what you're after:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}

s = requests.Session()
s.headers.update(headers)

r = s.get('https://media.info/newspapers/titles')
soup = bs(r.text)
letter_links = [x.get('href') for x in soup.select_one('div.pages').select('a')]

newspaper_links = []
for x in tqdm(letter_links):
    soup = bs(s.get(x).text)
    ns_links = soup.select_one('div.columns').select('a')
    for n in ns_links:
        newspaper_links.append((n.get_text(strip=True), 'https://media.info/' + n.get('href')))

detailed_infos = []
for x in tqdm(newspaper_links[:50]):
    soup = bs(s.get(x[1]).text)
    owner = soup.select_one('th:contains("Owner")').next_sibling.select_one('a').get_text(strip=True) if soup.select_one('th:contains("Owner")') else None
    website = soup.select_one('th:contains("Official website")').next_sibling.select_one('a').get_text(strip=True) if soup.select_one('th:contains("Official website")') else None
    detailed_infos.append((x[0], x[1], owner, website))

df = pd.DataFrame(detailed_infos, columns = ['Newspaper', 'Info Url', 'Owner', 'Official website'])
print(df)
Result in terminal:
Newspaper Info Url Owner Official website
0 Abergavenny Chronicle https://media.info//newspapers/titles/abergavenny-chronicle Tindle Newspapers abergavenny-chronicle-today.co.uk
1 Abergavenny Free Press https://media.info//newspapers/titles/abergavenny-free-press Newsquest Media Group freepressseries.co.uk
2 Abergavenny Gazette & Diary https://media.info//newspapers/titles/abergavenny-gazette-diary Tindle Newspapers abergavenny-chronicle-today.co.uk/tn/index.cfm
3 The Abingdon Herald https://media.info//newspapers/titles/the-abingdon-herald Newsquest Media Group abingdonherald.co.uk
4 Academies Week https://media.info//newspapers/titles/academies-week None academiesweek.co.uk
5 Accrington Observer https://media.info//newspapers/titles/accrington-observer Reach plc accringtonobserver.co.uk
6 Addlestone and Byfleet Review https://media.info//newspapers/titles/addlestone-and-byfleet-review Reach plc woking.co.uk
7 Admart & North Devon Diary https://media.info//newspapers/titles/admart-north-devon-diary Tindle Newspapers admart.me.uk
8 AdNews Willenhall, Wednesbury and Darlaston https://media.info//newspapers/titles/adnews-willenhall-wednesbury-and-darlaston Reach plc reachplc.com
9 The Advertiser https://media.info//newspapers/titles/the-advertiser DMGT dmgt.co.uk
10 Aintree and Maghull Champion https://media.info//newspapers/titles/aintree-and-maghull-champion Champion Media group champnews.com
11 Airdrie & Coatbridge World https://media.info//newspapers/titles/airdrie-coatbridge-world Reach plc icLanarkshire.co.uk
12 Airdrie and Coatbridge Advertiser https://media.info//newspapers/titles/airdrie-and-coatbridge-advertiser Reach plc acadvertiser.co.uk
13 Aire Valley Target https://media.info//newspapers/titles/aire-valley-target Newsquest Media Group thisisbradford.co.uk
14 Alcester Chronicle https://media.info//newspapers/titles/alcester-chronicle Newsquest Media Group redditchadvertiser.co.uk/news/alcester
15 Alcester Standard https://media.info//newspapers/titles/alcester-standard Bullivant Media redditchstandard.co.uk
16 Aldershot Courier https://media.info//newspapers/titles/aldershot-courier Guardian Media Group aldershot.co.uk
17 Aldershot Mail https://media.info//newspapers/titles/aldershot-mail Guardian Media Group aldershot.co.uk
18 Aldershot News & Mail https://media.info//newspapers/titles/aldershot-news-mail Reach plc gethampshire.co.uk/aldershot
19 Alford Standard https://media.info//newspapers/titles/alford-standard JPI Media skegnessstandard.co.uk
20 Alford Target https://media.info//newspapers/titles/alford-target DMGT dmgt.co.uk
21 Alfreton and Ripley Echo https://media.info//newspapers/titles/alfreton-and-ripley-echo JPI Media jpimedia.co.uk
22 Alfreton Chad https://media.info//newspapers/titles/alfreton-chad JPI Media chad.co.uk
23 All at Sea https://media.info//newspapers/titles/all-at-sea None allatsea.co.uk
24 Allanwater News https://media.info//newspapers/titles/allanwater-news HUB Media allanwaternews.co.uk
25 Alloa & Hillfoots Shopper https://media.info//newspapers/titles/alloa-hillfoots-shopper Reach plc reachplc.com
26 Alloa & Hillfoots Advertiser https://media.info//newspapers/titles/alloa-hillfoots-advertiser Dunfermline Press Group alloaadvertiser.com
27 Alloa and Hillfoots Wee County News https://media.info//newspapers/titles/alloa-and-hillfoots-wee-county-news HUB Media wee-county-news.co.uk
28 Alton Diary https://media.info//newspapers/titles/alton-diary Tindle Newspapers tindlenews.co.uk
29 Andersonstown News https://media.info//newspapers/titles/andersonstown-news Belfast Media Group irelandclick.com
30 Andover Advertiser https://media.info//newspapers/titles/andover-advertiser Newsquest Media Group andoveradvertiser.co.uk
31 Anfield and Walton Star https://media.info//newspapers/titles/anfield-and-walton-star Reach plc icliverpool.co.uk
32 The Anglo-Celt https://media.info//newspapers/titles/the-anglo-celt None anglocelt.ie
33 Annandale Herald https://media.info//newspapers/titles/annandale-herald Dumfriesshire Newspaper Group dng24.co.uk
34 Annandale Observer https://media.info//newspapers/titles/annandale-observer Dumfriesshire Newspaper Group dng24.co.uk
35 Antrim Times https://media.info//newspapers/titles/antrim-times JPI Media antrimtoday.co.uk
36 Arbroath Herald https://media.info//newspapers/titles/arbroath-herald JPI Media arbroathherald.com
37 The Arden Observer https://media.info//newspapers/titles/the-arden-observer Bullivant Media ardenobserver.co.uk
38 Ardrossan & Saltcoats Herald https://media.info//newspapers/titles/ardrossan-saltcoats-herald Newsquest Media Group ardrossanherald.com
39 The Argus https://media.info//newspapers/titles/the-argus Newsquest Media Group theargus.co.uk
40 Argyllshire Advertiser https://media.info//newspapers/titles/argyllshire-advertiser Oban Times Group argyllshireadvertiser.co.uk
41 Armthorpe Community Newsletter https://media.info//newspapers/titles/armthorpe-community-newsletter JPI Media jpimedia.co.uk
42 The Arran Banner https://media.info//newspapers/titles/the-arran-banner Oban Times Group arranbanner.co.uk
43 The Arran Voice https://media.info//newspapers/titles/the-arran-voice Independent News Ltd voiceforarran.com
44 The Art Newspaper https://media.info//newspapers/titles/the-art-newspaper None theartnewspaper.com
45 Ashbourne News Telegraph https://media.info//newspapers/titles/ashbourne-news-telegraph Reach plc ashbournenewstelegraph.co.uk
46 Ashby Echo https://media.info//newspapers/titles/ashby-echo Reach plc reachplc.com
47 Ashby Mail https://media.info//newspapers/titles/ashby-mail DMGT thisisleicestershire.co.uk
48 Ashfield Chad https://media.info//newspapers/titles/ashfield-chad JPI Media chad.co.uk
49 Ashford Adscene https://media.info//newspapers/titles/ashford-adscene DMGT thisiskent.co.uk
You can extract more information for each newspaper, as you wish - the above is just an example, going through the first 50 newspapers. Now if you want a multithreaded/async solution, I recommend you read the following, and apply it to your own scenario:
BeautifulSoup getting href of a list - need to simplify the script - replace multiprocessing
Lastly, Requests docs can be found here: https://requests.readthedocs.io/en/latest/
BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/index.html
For TQDM: https://pypi.org/project/tqdm/
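As a rough illustration of that multithreaded route (my own sketch, not part of the answer's tested code), the newspaper detail pages could be fetched in parallel with concurrent.futures; keep the worker count modest so the server isn't overwhelmed:
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup as bs

s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch_owner(name_and_url):
    # Fetch one newspaper page and pull out the Owner cell, if present
    name, url = name_and_url
    soup = bs(s.get(url).text, 'html.parser')
    owner_th = soup.select_one('th:contains("Owner")')
    owner = owner_th.find_next('td').get_text(strip=True) if owner_th else None
    return name, url, owner

# newspaper_links is the list of (name, url) tuples built in the code above
with ThreadPoolExecutor(max_workers=8) as pool:
    detailed_infos = list(pool.map(fetch_owner, newspaper_links[:50]))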
import string

import requests
from bs4 import BeautifulSoup

names = []
for letter in string.ascii_lowercase:
    page = requests.get("https://media.info/newspapers/titles/starting-with/{}".format(letter))
    soup = BeautifulSoup(page.content, "html.parser")
    for i in soup.find_all("a"):
        if i['href'].startswith("/newspapers/titles/"):
            names.append(i['href'])

How to scrape a website while iterating over multiple pages

Trying to scrape this website using Python and BeautifulSoup:
https://www.leandjaya.com/katalog
I'm having some challenges navigating the multiple pages of the website and scraping them with Python. The website has 11 pages, and I'm curious to know the best option to achieve this, such as using a for loop and breaking the loop if the page doesn't exist.
This is my initial code; I have set a big number, 50, but that doesn't seem like a good option.
page = 1
while page != 50:
    url = f"https://www.leandjaya.com/katalog/ss/1/{page}/"
    main = requests.get(url)
    pmain = BeautifulSoup(main.text, 'lxml')
    page = page + 1
Sample output:
https://www.leandjaya.com/katalog/ss/1/1/
https://www.leandjaya.com/katalog/ss/1/2/
https://www.leandjaya.com/katalog/ss/1/3/
https://www.leandjaya.com/katalog/ss/1/<49>/
This is one way to extract that info and display it in a dataframe, based on an unknown number of pages with data:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

cars_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}

s = requests.Session()
s.headers.update(headers)

counter = 1
while True:
    try:
        print('page:', counter)
        url = f'https://www.leandjaya.com/katalog/ss/1/{counter}/'
        r = s.get(url)
        soup = bs(r.text, 'html.parser')
        cars_cards = soup.select('div.item')
        if len(cars_cards) < 1:
            print('all done, no cars left')
            break
        for car in cars_cards:
            car_name = car.select_one('div.item-title').get_text(strip=True)
            car_price = car.select_one('div.item-price').get_text(strip=True)
            cars_list.append((car_name, car_price))
        counter = counter + 1
    except Exception as e:
        print('all done')
        break

df = pd.DataFrame(cars_list, columns = ['Car', 'Price'])
print(df)
Result:
page: 1
page: 2
page: 3
page: 4
page: 5
page: 6
page: 7
page: 8
page: 9
page: 10
page: 11
page: 12
all done, no cars left
Car Price
0 HONDA CRV 4X2 2.0 AT 2001 DP20jt
1 DUJUAL XPANDER 1.5 GLS 2018 MANUAL DP53jt
2 NISSAN JUKE 1.5 CVT 2011 MATIC DP33jt
3 Mitsubishi Xpander 1.5 Exceed Manual 2018 DP50jt
4 BMW X1 2.0 AT SDRIVE 2011 DP55jt
... ... ...
146 Daihatsu Sigra 1.2 R AT DP130jt
147 Daihatsu Xenia Xi 2010 DP85jt
148 Suzuki Mega Carry Pick Up 1.5 DP90jt
149 Honda Mobilio Tipe E Prestige DP150jt
150 Honda Freed Tipe S Rp. 170jtRp. 165jt
151 rows × 2 columns
The relevant documentations for the packages used above can be found at:
https://beautiful-soup-4.readthedocs.io/en/latest/index.html
https://requests.readthedocs.io/en/latest/
https://pandas.pydata.org/pandas-docs/stable/index.html

Scraping OpenTable website using python BeautifulSoup

I'm trying to scrape the OpenTable site using Beautiful Soup. The code runs successfully, but the result I am getting has a lot of NA columns. Here is the code.
def parse_html(html):
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search('\d+', booking.text).group() if booking else 'NA'
        rating = resto.select('div.all-stars.filled')
        item['rating'] = int(re.search('\d+', rating[0].get('style')).group()) if rating else 'NA'
        reviews = resto.find('span', class_='star-rating-text--review-text')
        item['reviews'] = int(re.search('\d+', reviews.text).group()) if reviews else 'NA'
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    return data.T

restaurants = pd.DataFrame()
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
    # driver.find_element_by_link_text('Next').click()

driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants)
and the results:
name bookings rating reviews price cuisine location
0 IL Carino Restaurant 1 NA NA 3 Upper East Side
1 French Roast Uptown 10 NA NA 3 Upper West Side
2 The Mermaid Inn Uptown 72 NA NA 3 Upper West Side
3 Cafe Du Soleil 101 NA NA 2 Upper West Side
4 The Leopard at des Artistes 24 NA NA 4 Upper West Side
Any recommendation or suggestion is appreciated.
I don't see the element used here on the page
rating = resto.select('div.all-stars.filled')
so the code can't find it either, which is why you get NA for rating.
But this gives me strings like 4.5 stars out of 5:
rating = resto.select('.star-rating .star-rating-score')
#print(rating)
item['rating'] = rating[0]['aria-label'] if rating else 'NA'
Likewise, I don't see this element on the page
resto.find('span', class_='star-rating-text--review-text')
so the code can't find it, which is why you get NA for reviews.
But this gives me strings like Awesome, Exceptional:
reviews = resto.select('div.review-rating-text span')
#print(reviews)
item['reviews'] = reviews[0].text if reviews else 'NA'
There are two elements with class 'rest-row-meta--cuisine', and you take the first one, so you get $$$$:
item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
You should use find_all to get both and then take the last one with [-1]:
item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
and this gives me
Pizzeria
Italian
Sushi
Steak
Contemporary Italian
Pizzeria
American
Italian
American
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
import re

def parse_html(html):
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search('\d+', booking.text).group() if booking else 'NA'
        rating = resto.select('.star-rating .star-rating-score')
        #print(rating)
        item['rating'] = rating[0]['aria-label'] if rating else 'NA'
        reviews = resto.find('span', class_='star-rating-text--review-text')
        reviews = resto.select('div.review-rating-text span')
        #print(reviews)
        item['reviews'] = reviews[0].text if reviews else 'NA'
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
        #print(item['cuisine'])
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    return data.T

restaurants = pd.DataFrame()

#driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()

url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
    # driver.find_element_by_link_text('Next').click()

#driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants[['rating', 'reviews', 'cuisine']])
rating reviews cuisine
0 4.5 stars out of 5 Awesome Italian
1 4.5 stars out of 5 Awesome French American
2 4.7 stars out of 5 Exceptional Italian
3 4.8 stars out of 5 Exceptional Seafood
4 4.4 stars out of 5 Awesome French
.. ... ... ...
95 4.7 stars out of 5 Exceptional Contemporary Italian
96 4 stars out of 5 Excellent Pizzeria
97 NA NA American
98 4.7 stars out of 5 Exceptional Italian
99 4.4 stars out of 5 Awesome American

Scraping e-commerce in python - cannot fetch product categories and total amounts

So far my code can scrape the number of items on sale in the category Charms. But I cannot make it print out the name of the category.
The site uses an infinite scroller, but I managed to identify where the pages are, and thus the site URL contains {}, which is filled in by the while loop.
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link,page):
    Total_items = 0
    while page<=1000:
        #print("current page no: ",page)
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page+=30
    category_tags = soup.select('span.breadcrumb-element')
    return Total_items
    return category_tags

if __name__ == '__main__':
    page = 0
    product_list = []
    total_items = fetch_items(url,page)
    #print number of items on sale
    print(total_items)
    print(category_tags)
Here's what I need:
I need to print out the category of the scraped items, which can be found using this line:
category_tags = soup.select('span.breadcrumb-element')
But I cannot make it print somehow.
While we're at it, how can I make the code print out ALL the items and not just the items on sale?
Thank you.
EDIT:
Building on one of the answers below, I ended up with this.
import requests
from bs4 import BeautifulSoup
import re
url1 = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
url2 = "https://us.pandora.net/en/bracelets/?sz=30&start={}&format=page-element"
url3 = "https://us.pandora.net/en/rings/?sz=30&start={}&format=page-element"
url4 = "https://us.pandora.net/en/necklaces/?sz=30&start={}&format=page-element"
url5 = "https://us.pandora.net/en/earrings/?sz=30&start={}&format=page-element"
#res = requests.get(link.format(url1),headers={"User-Agent":"Mozilla/5.0"})
soup1 = BeautifulSoup(requests.get(url1.format(0)).text, 'lxml')
soup2 = BeautifulSoup(requests.get(url2.format(0)).text, 'lxml')
soup3 = BeautifulSoup(requests.get(url3.format(0)).text, 'lxml')
soup4 = BeautifulSoup(requests.get(url4.format(0)).text, 'lxml')
soup5 = BeautifulSoup(requests.get(url5.format(0)).text, 'lxml')
total_items1 = ''.join(re.findall(r'\d', soup1.select_one('span.products-count').text))
total_items2 = ''.join(re.findall(r'\d', soup2.select_one('span.products-count').text))
total_items3 = ''.join(re.findall(r'\d', soup3.select_one('span.products-count').text))
total_items4 = ''.join(re.findall(r'\d', soup4.select_one('span.products-count').text))
total_items5 = ''.join(re.findall(r'\d', soup5.select_one('span.products-count').text))
#categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')
#total_items_sale1 = ''.join(re.findall(r'\d', soup1.select_one('.grid-tile .price-standard')))
#total_items_sale1
#total_items_sale1
#total_items_sale1
#total_items_sale1
#print('Categories:')
#for category in categories:
#print('\t{}'.format(category))
print('\nTotal Charms: {}'.format(total_items1))
print('\nTotal Bracelets: {}'.format(total_items2))
print('\nTotal Rings: {}'.format(total_items3))
print('\nTotal Necklaces: {}'.format(total_items4))
print('\nTotal Earrings: {}'.format(total_items5))
I know it looks horrible. How can we shorten it?
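One way that repetition could be condensed (a sketch of my own, assuming every category listing exposes the same span.products-count element used above) is to loop over the category slugs:
import re

import requests
from bs4 import BeautifulSoup

base_url = "https://us.pandora.net/en/{}/?sz=30&start=0&format=page-element"
categories = ['charms', 'bracelets', 'rings', 'necklaces', 'earrings']

totals = {}
for category in categories:
    soup = BeautifulSoup(requests.get(base_url.format(category)).text, 'lxml')
    # Pull the digits out of the products-count element, as in the snippet above
    totals[category] = ''.join(re.findall(r'\d', soup.select_one('span.products-count').text))

for category, total in totals.items():
    print('Total {}: {}'.format(category.capitalize(), total))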
Looking at the result from the server, you don't have to loop through all pages. All the info you have on one page:
import requests
from bs4 import BeautifulSoup
url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
sale_url = "https://us.pandora.net/en/sale/sale-charms/?sz=30&start={}&format=page-element"
soup = BeautifulSoup(requests.get(url.format(0)).text, 'lxml')
sale_soup = BeautifulSoup(requests.get(sale_url.format(0)).text, 'lxml')
total_items = soup.select_one('#products_count')['value']
total_sale_items = sale_soup.select_one('#products_count')['value']
categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')]
print('Categories:')
for category in categories:
    print('\t{}'.format(category))
print('\nTotal items: {}'.format(total_items))
print('Total sale items: {}'.format(total_sale_items))
Prints:
Categories:
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
$0 - $50
$50 - $100
$100 - $150
$150 & Over
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
Total items: 959
Total sale items: 376
You can't have two returns there. The function stops at the first return, so if you want to return multiple objects, return them together in one line (as a tuple). You also need to append the category text to a list inside the loop; you currently have that outside of your loop. Note: I changed the limit from 1000 to 300 just to test it.
Secondly, I think what you want is the text.
To print all the items, you'll need to get each item, not just the ones with 'price-standard'.
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link,page):
    Total_items = 0
    categories = []
    while page<=300:
        #print("current page no: ",page)
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page+=30
        print(page)
        category_tags = soup.select('span.breadcrumb-element')[0]
        try:
            categories.append(category_tags.text)
        except:
            categories.append('N/A')
    return Total_items, categories

page = 0
total_items = fetch_items(url,page)

#print number of items on sale
print(total_items[0])
print(total_items[1])
Here's how you can go about getting the whole products:
def fetch_items(link,page):
    Total_items = 0
    names = []
    categories = []
    prices = []
    sales = []
    while page<=300:
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        products = soup.find_all("li", class_=lambda value: value and value.startswith("grid-tile"))
        for each in products:
            Total_items += 1
            category = each.find('div', {'class':'product-tile'})['data-cgid']
            name = each.find('div', {'class':'product-name'}).text.strip()
            price = each.find('div', {'class':'product-pricing'}).text.strip()
            sale_price = each.find('span', {'class':'price-sales'}).text.strip()
            names.append(name)
            categories.append(category)
            prices.append(price)
            sales.append(sale_price)
        print(page)
        page+=30
    return Total_items, names, categories, prices, sales

results = fetch_items(url,page)
Not sure how you want those results, though. But you can dump them into a table if you'd like:
import pandas as pd
df = pd.DataFrame(
    {'name': results[1],
     'category': results[2],
     'price': results[3],
     'sale': results[4]})
Output:
print (df.head(10).to_string())
name category price sale
0 American Icons Dangle Charm charms $60.00 $60.00
1 Disney Pixar, Toy Story, Buzz Lightyear Dangle... charms $70.00 $70.00
2 Disney Pixar, Toy Story, Woody Dangle Charm charms $60.00 $60.00
3 Spinning Globe Dangle Charm charms $60.00 $60.00
4 Elephant Charm charms $45.00 $45.00
5 Canada Dangle Charm, Pandora Rose™ charms $65.00 $65.00
6 Sparkling Monkey Charm charms $70.00 $70.00
7 Propeller Plane Dangle Charm charms $55.00 $55.00
8 Spotted Heart Charm charms $50.00 $50.00
9 Pink Travel Bag Charm charms $50.00 $50.00
