dataframe from 2 classes beautifulsoup

dataframe from 2 classes beautifulsoup - python

I want to extract Arbeitsatmosphare rank and Stadt information based on review data from below website for ALL pages, so desired output should be as in example below
Arbeitsatmosphare | Stadt
1. 4.00 | Berlin
2. 5.00 | Frankfurt
3. 3.00 | Munich
4. 5.00 | Berlin
5. 4.00 | Berlin
Below code extracts pro data from website from all pages and works fine. I tried to update it and to add 2 lists, Arbeitsatmosphare rank and Stadt in it and break cycle if Arbeitsatmosphare rank info is missing, but my code is not working. Can you help?
pro = []
with requests.Session() as session:
session.headers = {
'x-requested-with': 'XMLHttpRequest'
}
page = 1
while True:
print(f"Processing page {page}..")
url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
response = session.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
new_comments = [
pro.find_next_sibling('p').get_text()
for pro in soup.find_all('h2', text='Pro')
]
if not new_comments:
print(f"No more comments. Page: {page}")
break
pro += new_comments
print(pro)
#print(len(pro))
page += 1
print(pro)
UPD
Adding my code that is not working, however I think that there should be more simple solution
Arbeit = []
Stadt=[]
with requests.Session() as session:
session.headers = {
'x-requested-with': 'XMLHttpRequest'
}
page = 1
while True:
print(f"Processing page {page}..")
url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
response = session.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
new_comments1 = [
Arbeit.find_next_sibling('span').get_text()
for Arbeit in soup.find_all('span', text='Arbeitsatmosphäre')
]
new_comments2 = [
Stadt.find_next_sibling('div').get_text()
for Stadt in soup.find_all('div', text='Stadt')
]
if not new_comments1:
print(f"No more comments. Page: {page}")
break
Arbeit += new_comments1
Stadt += new_comments2
print(Arbeit)
print(Stadt)
#print(len(pro))
page += 1

You can try:
import requests
from bs4 import BeautifulSoup
import pandas as pd
arbeit = []
firma = []
stadt = []
with requests.Session() as session:
session.headers = {
'x-requested-with': 'XMLHttpRequest'
}
page = 1
while True:
print(f"Processing page {page}..")
url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
response = session.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
articles = soup.find_all('article')
print("Number of articles: " + str(len(articles)))
for article in articles:
rating_tags = article.find_all('span', {'class' : 'rating-badge'})
arbeit.append(rating_tags[0].text.strip())
detail_div = article.find_all('div', {'class' : 'review-details'})[0]
nodes = detail_div.find_all('li')
firma_node = nodes[0]
stadt_node = nodes[1]
firma_node_div = firma_node.find_all('div')
firma_name = firma_node_div[1].text.strip()
firma.append(firma_name)
stadt_node_div = stadt_node.find_all('div')
stadt_name = stadt_node_div[1].text.strip()
stadt.append(stadt_name)
page += 1
pagination = soup.find_all('div', {'class' : 'paginationControl'})
if not pagination:
break
df = pd.DataFrame({'Arbeitsatmosphäre' : arbeit, 'Stadt' : stadt})
print(df)

Related

How to get content by table row and if non applicable append "None"?

I have a problem when iterating through many links. I scrape according to columns css selector. however, as it seems there is among all links not a rating for every player. How do I manage that I get a "None" for the home_rating list when there is among the eleven starting squad no rating available in a specific "player row".
I basically need to scrape all column entries per row. thanks a lot for your support.
gamedays_url = range(1,35)
url_list = []
daylinks = []
for gameday in gamedays_url:
url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
url_list.append(url)
response = requests.get(url, headers={'User-Agent': 'Custom5'})
gameLinks = []
for i in range(len(url_list)):
page = url_list
tree = requests.get(page[i], headers = {'User-Agent': 'Custom5'})
soup_2 = BeautifulSoup(tree.content, 'html.parser')
links_2 = soup_2.find_all("a", {"class": "liveLink"}, href=re.compile("spielbericht"))
for j in range(len(links_2)):
gameLinks.append(links_2[j].get('href').split('/')[4])
for j in range(len(gameLinks)):
gameLinks[j] = "https://www.transfermarkt.de/spiele/aufstellung/spielbericht/" + gameLinks [j]
home_id = []
home_name = []
homerating = []
for p in range(len(gameLinks)):
page = gameLinks[p]
response = requests.get(page, headers={'User-Agent': 'Custom5'})
lineup_data = response.text
soup = BeautifulSoup(lineup_data, 'html.parser')
test =soup.find('div', class_='responsive-table')
for homeid in test.find_all('a', href=re.compile('profil/spieler')):
home_id.append(homeid.get('href').split('/')[4])
for homename in test.find_all('a', href=re.compile('profil/spieler')):
home_name.append(homename.get('href').split('/')[1])
for grade in test.find_all('span', class_=None):
homerating.append(grade.text.split()[0])
homerating.append(None)

Try to check if your selected element is available and scrape the text alse set it to None:
row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
Also try to work with structured dicts instead of list.
Example
import requests
from bs4 import BeautifulSoup
data = []
for gameday in range(1,3):
url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
response = requests.get(url, headers={'User-Agent': 'Custom5'})
soup = BeautifulSoup(response.content)
for a in soup.select('a.liveLink[href*="spielbericht"]'):
report_url = 'https://www.transfermarkt.de/spiele/aufstellung/spielbericht/'+a.get('href').split('/')[-1]
response = requests.get(report_url, headers={'User-Agent': 'Custom5'})
soup = BeautifulSoup(response.text)
for row in soup.select_one('table.items').select('tr:has(table)'):
data.append({
'home_id': row.select_one('a').get('href').split('/')[-1],
'home_name': row.select_one('a img').get('title'),
'home_rating': row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
})
data
Output
[...{'home_id': '45672', 'home_name': 'Kevin Trapp', 'home_rating': '3,4'},{'home_id': '256866', 'home_name': 'Carlos Salcedo', 'home_rating': None},{'home_id': '58178', 'home_name': 'David Abraham', 'home_rating': '3,4'}, {'home_id': '146258', 'home_name': 'Jetro Willems', 'home_rating': '5,5'},...]

Why does scraper stop at the 12th image?

So I have to scrape all the products from this website's shop ( https://bewellstore.ro/shop/), but my code stops at the 12th photo. I have made a version for websites with multiple shop pages where I take them all in a for loop, but since here it's only one page I thought that isn't necessary.
Any idea why my code stops at the 12th product?
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)
root_folder = os.getcwd()
baseurl = 'https://bewellstore.ro/shop/'
# an array for all the product links
product_links = []
# going through all the pages of the shop
url = 'https://bewellstore.ro/shop/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
product_list = soup.find_all('div', class_= 'loop-product-inner')
print (product_list)
# taking all the links to each product page
for item in product_list:
for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
product_links.append(link['href'])
# appending the links previously taken to the array
print(product_links)
product_items_list = []
i = 0
d = {} # use as set()
os.chdir(folder)
for link_test in product_links:
r = requests.get(link_test)
soup = BeautifulSoup(r.content, 'lxml')
title = soup.find('h1', class_='product_title').text.strip()
price = soup.find('p', class_ = 'price').text.strip()
header = soup.find('div', class_ = 'woocommerce-product-details__short-description').text.strip()
sku = soup.find('span', class_ = 'sku').text.strip()
categories = soup.find('div' , class_ = 'posted_in').text.strip()
description = soup.find('div', class_ = 'cell large-6').text.strip()
brand = soup.find('div', class_ = 'tabs-panel').text.strip()
images = soup.select('.wp-post-image')
# --- before `for`-loop ---
downloaded = []
# --- `for`-loop ---
for image in images:
link = image['src']
if link in d:
name = d[link]
downloaded.append(name)
else:
i += 1
name = str(i) +'img.jpg'
d[link] = name
print('link:', link)
print('name:', name)
print('---')
# here i am adding the .jpg and saving the images
with open(name, 'wb') as f:
im = requests.get(link)
#print("URMEAZA DEBUG: {}".format(im))
f.write(im.content)
downloaded.append(name)
# --- after `for`-loop ---
# storing all the infos about this product
img_str = ''
if len(downloaded) > 1:
for index, img in enumerate(downloaded):
if index == len(downloaded)-1:
img_str = img_str + img
else:
img_str = img_str + img + '/'
else:
img_str = downloaded[0]
product = {
'sku': sku,
'base_image': img_str,
'small_image': img_str,
'thumbnail_image': img_str,
'additional_images': img_str,
'product_type': 'simple',
'attribute_set_code': 'Default',
'categories': categories.replace('Categorii: ','').replace(', ', '/'),
'name' : title,
'description': description,
'short_description': header,
'price' : price[0:5]
}
product_items_list.append(product)
os.chdir(root_folder)
# os.chdir('output')
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)

That's because this webpage uses pagination (with 12 products per page) and each page gets loaded only when you scroll. You will have to use selenium to scroll the page.
But if you only want to use beautifulsoup then there is a work around.
The URL for each page looks like this
https://bewellstore.ro/shop/page/<page_no>/
Example:
1st page: https://bewellstore.ro/shop/page/1/
2nd page: https://bewellstore.ro/shop/page/2/
You could make a request to each of the above URLs and scrape your data using beautifulsoup.

You can try this for the all pages
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
results = []
page_number = 1
import requests
product_links = []
headers = {
'authority': 'bewellstore.ro',
'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
'sec-ch-ua-platform': '"Linux"',
'accept': '*/*',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://bewellstore.ro/shop/',
'accept-language': 'en-US,en;q=0.9',
'cookie': 'fp_session=new; mc_landing_site=https://bewellstore.ro/shop/; _omappvp=i5rIyW2xsMFKIu3uhQtmFj1TN9jw7aKjO8dgy3SVvWMhAj30NvKFrBXfJLe3dQK6ZdbB4FezbrwFWPGLdKrsj1A1vqN2PRLI; _omappvs=1634795539874; _clck=1f7zptk|1|evr|0; _ga=GA1.2.2117949575.1634795541; _gid=GA1.2.1155690725.1634795541; _fbp=fb.1.1634795541140.1266696245; PHPSESSID=94b6b1996b0b5e831d898c192b4bca06; _clsk=2489zg|1634795542054|1|1|e.clarity.ms/collect; yith_wcwl_session_d235bd7d63b3a120c05ba3c90256789a=%7B%22session_id%22%3A%222e40c31b1503902767c5327edd3cf926%22%2C%22session_expiration%22%3A1637387542%2C%22session_expiring%22%3A1637383942%2C%22cookie_hash%22%3A%2249a81940bd8d39b2f894021c16333e6f%22%7D; omSeen-dwf9rgtvzzrhqylccaag=1634795583943; om-dwf9rgtvzzrhqylccaag=1634795585931; _omra={"dwf9rgtvzzrhqylccaag":"click"}; cookie_notice_accepted=true; ls_smartpush=fdfbe0ffe7800007',
}
while True:
response = requests.get(f'https://bewellstore.ro/shop/page/{page_number}/', headers=headers)
print(response.status_code)
print(response.url)
if response.status_code != 200:
break
soup = BeautifulSoup(response.content, 'html.parser')
product_list = soup.find_all('div', class_= 'loop-product-inner')
# print (product_list)
for item in product_list:
for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
product_links.append(link['href'])
print('Addedn link in product_links list :', link['href'])
product_items_list = []
i = 0
d = {}
for link_test in product_links:
r = requests.get(link_test)
soup = BeautifulSoup(r.content, 'lxml')
title = soup.find('h1', class_='product_title').text.strip()
price = soup.find('p', class_ = 'price').text.strip()
header = soup.find('div', class_ = 'woocommerce-product-details__short-description').text.strip()
sku = soup.find('span', class_ = 'sku').text.strip()
categories = soup.find('div' , class_ = 'posted_in').text.strip()
description = soup.find('div', class_ = 'cell large-6').text.strip()
brand = soup.find('div', class_ = 'tabs-panel').text.strip()
images = soup.select('.wp-post-image')
downloaded = []
for image in images:
link = image['src']
if link in d:
name = d[link]
downloaded.append(name)
else:
i += 1
name = str(i) +'img.jpg'
d[link] = name
print('link:', link)
print('name:', name)
print('---')
# here i am adding the .jpg and saving the images
with open(name, 'wb') as f:
im = requests.get(link)
#print("URMEAZA DEBUG: {}".format(im))
f.write(im.content)
downloaded.append(name)
img_str = ''
if len(downloaded) > 1:
for index, img in enumerate(downloaded):
if index == len(downloaded)-1:
img_str = img_str + img
else:
img_str = img_str + img + '/'
else:
img_str = downloaded[0]
product = {
'sku': sku,
'base_image': img_str,
'small_image': img_str,
'thumbnail_image': img_str,
'additional_images': img_str,
'product_type': 'simple',
'attribute_set_code': 'Default',
'categories': categories.replace('Categorii: ','').replace(', ', '/'),
'name' : title,
'description': description,
'short_description': header,
'price' : price[0:5]
}
product_items_list.append(product)
page_number += 1
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)

Web Scraping Dynamic Pages - Adjusting the code

αԋɱҽԃ αмєяιcαη helped me in constructing this code for scraping reviews from this page where reviews are dynamically loaded. I then tried to adjust it so that it scrapes not just the comment-body, but also the commentors' names, dates, and ratings, and for the code to save the extracted data into an excel file. But I failed to do so. Could someone help me in adjusting the code correctly?
This is the code from αԋɱҽԃ αмєяιcαη
import requests
from bs4 import BeautifulSoup
import math
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
for item in range(1, num):
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"comment-body\"'):
print(com.text[5:com.text.find(r"\n", 3)])
Main()
This is the code I adjusted but then got errors that I couldn't resolve
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd
df = pd.DataFrame()
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
for item in range(1, num):
names = []
headers = []
bodies = []
ratings = []
published = []
updated = []
reported = []
dateElements = []
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"user-review\"'):
names.append(article.find('div', attrs={'class': 'name'}).text.strip())
try:
bodies.append(article.find('div', attrs={'class': 'comment-body'}).text.strip())
except:
bodies.append('NA')
try:
ratings.append(article.find('meta', attrs={'itemprop': 'ratingValue'})['content'])
except:
ratings.append('NA')
dateElements.append(article.find('div', attrs={'class': 'comment-date'}).text.strip())
print(com.text[5:com.text.find(r"\n", 3)])
temp_df = pd.DataFrame(
{'User Name': names, 'Body': bodies, 'Rating': ratings, 'Published Date': dateElements})
df = df.append(temp_df, sort=False).reset_index(drop=True)
Main()
df.to_csv('Allure10.csv', index=False, encoding='utf-8')
print ('excel done')

import requests
from bs4 import BeautifulSoup
import math
import csv
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
names = []
dates = []
comments = []
rating = []
for item in range(1, num):
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"comment-body\"'):
comments.append(com.text[5:com.text.find(r"\n", 3)])
for name in soup.findAll("div", class_=r'\"name\"'):
names.append(name.text[:name.text.find(r"<\/div>", 1)])
for date in soup.findAll("div", class_=r'\"comment-date\"'):
dates.append(date.text[:date.text.find(r"<\/div>", 1)])
for rate in soup.findAll("meta", itemprop=r'\"ratingValue\"'):
rating.append(rate.get("content")[2:-3])
return zip(names, dates, rating, comments)
def Save():
data = Main()
with open("oka.csv", 'w', newline="", encoding="UTF-8") as f:
writer = csv.writer(f)
writer.writerow(["Name", "Dates", "Rating", "Comments"])
writer.writerows(data)
Save()
Output: check-online

beautifulsoup for loop extracts only first page data

I have a txt file with 2 urls in it
https://www.kununu.com/de/volkswagen/kommentare
https://www.kununu.com/de/audi/kommentare
I want to extract some data from all pages in that urls with beautifulsoup. Below code extracts that data but only for first page. I should be missing something, can you update code so, it will extract from all pages?
firma = []
lineList2 = [line.rstrip('\n') for line in open(r"C:/myfolder/555.txt")]
print(lineList2)
for url in lineList2:
with requests.Session() as session:
session.headers = {
'x-requested-with': 'XMLHttpRequest'
}
page = 1
while True:
print(f"Processing page {page}..")
url = f'{url}/{page}'
response = session.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
articles = soup.find_all('article')
print("Number of articles: " + str(len(articles)))
for article in articles:
try:
firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
firma.append(firmaText)
except:
firma.append('N/A')
page += 1
pagination = soup.find_all('div', {'class': 'paginationControl'})
if not pagination:
break
df = pd.DataFrame({
'Company': firma
})
print(df)

from bs4 import BeautifulSoup
import requests
import pandas as pd
firma = []
lineList2=[]
with open('555.txt', 'r') as file:
lines = file.readlines()
for line in lines:
lineList2.append(line.strip('\n'))
print(lineList2)
for lurl in lineList2:
with requests.Session() as session:
session.headers = {
'x-requested-with': 'XMLHttpRequest'
}
page = 1
while True:
print("in while")
print(f"Processing page {page}..")
url = f'{lurl}/{page}'
print(url)
response = session.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
articles = soup.find_all('article')
print("Number of articles: " + str(len(articles)))
for article in articles:
try:
firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
firma.append(firmaText)
except:
firma.append('N/A')
page += 1
pagination = soup.find_all('div', {'class': 'paginationControl'})
if not pagination:
break
df = pd.DataFrame({
'Company': firma
})
print(df)

Cannot web scrape a page which includes pagination and products in grid layout using python

I want to web scrape the following webpage
https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber=1
But I keep getting a part of the url links the first 12 of the first 2 pages, not the 3rd, and not the total links. I used the following:
initial_url = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10'
caturl = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber={}'
r = requests.get(initial_url)
if r.status_code == 200:
Myhtml = r.text
soup = BeautifulSoup(Myhtml, 'html.parser')
#GETTING THE LAST PAGE
last_page = soup.find('div', class_='pager').find('li', class_='next-page').a['href'].split('=')[1]
#GETTING THE PAGE URL LINKS
dept_page_url = [caturl.format(i) for i in range(1, int(last_page)+1)]
time.sleep(2)
for pageurl in dept_page_url:
r = requests.get(pageurl)
if r.status_code == 200:
Myhtml = r.text
soup = BeautifulSoup(Myhtml, 'html.parser')
#GETTING THE PRODUCT LINKS
productlist = soup.find('div',attrs={'class':'item-grid'})
atagslist = productlist.findAll('a', href=True)
links_with_text = []
final_links = []
for a in atagslist:
if a.text:
mlink = a['href']
if mlink !='#':
links_with_text.append(infodomain+mlink)
#DELETE DUPLICATES
links_with_text = list(dict.fromkeys(links_with_text))
links_with_text.extend(links_with_text)
How can I get all URL links

You could mimic the POST request the page makes and have an exit condition based on next-page being present
import requests
from bs4 import BeautifulSoup as bs
headers = {
'user-agent': 'Mozilla/5.0',
'content-type': 'application/json; charset=UTF-8',
'authority': 'www.websupplies.gr',
'x-requested-with': 'XMLHttpRequest'
}
links = []
page = 1
with requests.Session() as s:
while True:
data = '{"categoryId":"405","manufacturerId":"0","vendorId":"0","priceRangeFilterModel7Spikes":{"CategoryId":"405","ManufacturerId":"0","VendorId":"0","SelectedPriceRange":{},"MinPrice":"204","MaxPrice":"3850"},"specificationFiltersModel7Spikes":{"CategoryId":"405","ManufacturerId":"0","VendorId":"0","SpecificationFilterGroups":[{"Id":"658","FilterItems":[{"Id":"4821","FilterItemState":"Unchecked"},{"Id":"1969","FilterItemState":"Unchecked"},{"Id":"4394","FilterItemState":"Unchecked"},{"Id":"1971","FilterItemState":"Unchecked"},{"Id":"5459","FilterItemState":"Unchecked"},{"Id":"1953","FilterItemState":"Unchecked"},{"Id":"1962","FilterItemState":"Unchecked"},{"Id":"1963","FilterItemState":"Unchecked"}]},{"Id":"900","FilterItems":[{"Id":"2503","FilterItemState":"Unchecked"},{"Id":"2504","FilterItemState":"Unchecked"},{"Id":"2505","FilterItemState":"Unchecked"}]},{"Id":"944","FilterItems":[{"Id":"2715","FilterItemState":"Unchecked"},{"Id":"2714","FilterItemState":"Unchecked"}]},{"Id":"980","FilterItems":[{"Id":"2994","FilterItemState":"Unchecked"},{"Id":"2835","FilterItemState":"Unchecked"},{"Id":"2836","FilterItemState":"Unchecked"},{"Id":"4381","FilterItemState":"Unchecked"}]},{"Id":"988","FilterItems":[{"Id":"2882","FilterItemState":"Unchecked"},{"Id":"2883","FilterItemState":"Unchecked"},{"Id":"2989","FilterItemState":"Unchecked"}]},{"Id":"901","FilterItems":[{"Id":"2520","FilterItemState":"Unchecked"},{"Id":"2521","FilterItemState":"Unchecked"},{"Id":"2512","FilterItemState":"Unchecked"},{"Id":"2611","FilterItemState":"Unchecked"},{"Id":"2513","FilterItemState":"Unchecked"},{"Id":"5995","FilterItemState":"Unchecked"},{"Id":"2970","FilterItemState":"Unchecked"},{"Id":"2530","FilterItemState":"Unchecked"},{"Id":"5996","FilterItemState":"Unchecked"}]},{"Id":"986","FilterItems":[{"Id":"2971","FilterItemState":"Unchecked"},{"Id":"2872","FilterItemState":"Unchecked"},{"Id":"2871","FilterItemState":"Unchecked"},{"Id":"4995","FilterItemState":"Unchecked"},{"Id":"5009","FilterItemState":"Unchecked"}]},{"Id":"761","FilterItems":[{"Id":"4358","FilterItemState":"Unchecked"},{"Id":"4359","FilterItemState":"Unchecked"},{"Id":"4361","FilterItemState":"Unchecked"},{"Id":"5460","FilterItemState":"Unchecked"},{"Id":"4362","FilterItemState":"Unchecked"},{"Id":"4822","FilterItemState":"Unchecked"},{"Id":"4371","FilterItemState":"Unchecked"}]},{"Id":"917","FilterItems":[{"Id":"4826","FilterItemState":"Unchecked"},{"Id":"4825","FilterItemState":"Unchecked"},{"Id":"5357","FilterItemState":"Unchecked"},{"Id":"4827","FilterItemState":"Unchecked"},{"Id":"5345","FilterItemState":"Unchecked"},{"Id":"4828","FilterItemState":"Unchecked"}]},{"Id":"911","FilterItems":[{"Id":"4843","FilterItemState":"Unchecked"},{"Id":"4845","FilterItemState":"Unchecked"},{"Id":"4850","FilterItemState":"Unchecked"},{"Id":"4851","FilterItemState":"Unchecked"},{"Id":"5891","FilterItemState":"Unchecked"},{"Id":"5892","FilterItemState":"Unchecked"},{"Id":"5291","FilterItemState":"Unchecked"},{"Id":"6011","FilterItemState":"Unchecked"},{"Id":"6552","FilterItemState":"Unchecked"},{"Id":"6949","FilterItemState":"Unchecked"}]}]},"attributeFiltersModel7Spikes":null,"manufacturerFiltersModel7Spikes":{"CategoryId":"405","ManufacturerFilterItems":[{"Id":"268","FilterItemState":"Unchecked"},{"Id":"63","FilterItemState":"Unchecked"},{"Id":"191","FilterItemState":"Unchecked"},{"Id":"9","FilterItemState":"Unchecked"},{"Id":"330","FilterItemState":"Unchecked"},{"Id":"5","FilterItemState":"Unchecked"}]},"vendorFiltersModel7Spikes":null,"pageNumber":"'+ str(page) + '","orderby":"10","viewmode":"list","pagesize":"48","queryString":"","shouldNotStartFromFirstPage":true,"onSaleFilterModel":null,"keyword":"","searchCategoryId":"0","searchManufacturerId":"0","priceFrom":"","priceTo":"","includeSubcategories":"False","searchInProductDescriptions":"False","advancedSearch":"False","isOnSearchPage":"False"}'
r = s.post('https://www.websupplies.gr/getFilteredProducts', headers=headers,data=data)
soup = bs(r.content, 'lxml')
links.append([item['href'] for item in soup.select('.product-title a')])
page+=1
if soup.select_one('.next-page') is None:
break
base = 'https://www.websupplies.gr'
final_list = {base + item for i in links for item in i}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

dataframe from 2 classes beautifulsoup - python

Related

How to get content by table row and if non applicable append "None"?

Why does scraper stop at the 12th image?

Web Scraping Dynamic Pages - Adjusting the code

beautifulsoup for loop extracts only first page data

Cannot web scrape a page which includes pagination and products in grid layout using python

Categories

Resources