beautifulsoup for loop extracts only first page data - python

I have a txt file with 2 URLs in it:
https://www.kununu.com/de/volkswagen/kommentare
https://www.kununu.com/de/audi/kommentare
I want to extract some data from all pages of those URLs with BeautifulSoup. The code below extracts that data, but only for the first page. I must be missing something; can you update the code so it extracts from all pages?
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

firma = []
lineList2 = [line.rstrip('\n') for line in open(r"C:/myfolder/555.txt")]
print(lineList2)

for url in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print(f"Processing page {page}..")
            url = f'{url}/{page}'
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')
            page += 1
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

df = pd.DataFrame({
    'Company': firma
})
print(df)

The problem is that you overwrite url inside the while loop (url = f'{url}/{page}'), so from the second iteration on you keep appending page numbers to an already-modified URL. Keep the URL read from the file in its own variable (lurl below) and build each page URL from it:

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

firma = []
lineList2 = []
with open('555.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        lineList2.append(line.strip('\n'))
print(lineList2)

for lurl in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print(f"Processing page {page}..")
            url = f'{lurl}/{page}'
            print(url)
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')
            page += 1
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

df = pd.DataFrame({
    'Company': firma
})
print(df)
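As a follow-up, the same logic can be wrapped in a small helper so each URL from the file is processed independently. This is only a sketch: it assumes the same kununu markup as above (article blocks, a "Firma" label, and a paginationControl div) and adds an extra guard that stops as soon as a page returns no articles at all.

import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_companies(base_url, session):
    """Collect the 'Firma' value from every review page of one base URL (sketch)."""
    rows = []
    page = 1
    while True:
        response = session.get(f"{base_url}/{page}")
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        if not articles:  # extra guard: stop when a page comes back empty
            break
        for article in articles:
            label = article.find('div', text=re.compile(r'Firma'))
            rows.append(label.find_next('div').text.strip() if label else 'N/A')
        if not soup.find_all('div', {'class': 'paginationControl'}):
            break
        page += 1
    return rows

with requests.Session() as session:
    session.headers = {'x-requested-with': 'XMLHttpRequest'}
    firma = []
    for base_url in lineList2:  # lineList2 as read from 555.txt above
        firma += scrape_companies(base_url, session)

print(pd.DataFrame({'Company': firma}))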

Related

How to get content by table row and if non applicable append "None"?

I have a problem when iterating through many links. I scrape column by column using CSS selectors, but not every link has a rating for every player. How do I make sure a "None" is appended to the home_rating list when no rating is available for a specific player row in the starting eleven?
I basically need to scrape all column entries per row. Thanks a lot for your support.
import re
import requests
from bs4 import BeautifulSoup

gamedays_url = range(1, 35)
url_list = []
daylinks = []

for gameday in gamedays_url:
    url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
    url_list.append(url)
    response = requests.get(url, headers={'User-Agent': 'Custom5'})

gameLinks = []
for i in range(len(url_list)):
    page = url_list
    tree = requests.get(page[i], headers={'User-Agent': 'Custom5'})
    soup_2 = BeautifulSoup(tree.content, 'html.parser')
    links_2 = soup_2.find_all("a", {"class": "liveLink"}, href=re.compile("spielbericht"))
    for j in range(len(links_2)):
        gameLinks.append(links_2[j].get('href').split('/')[4])

for j in range(len(gameLinks)):
    gameLinks[j] = "https://www.transfermarkt.de/spiele/aufstellung/spielbericht/" + gameLinks[j]

home_id = []
home_name = []
homerating = []

for p in range(len(gameLinks)):
    page = gameLinks[p]
    response = requests.get(page, headers={'User-Agent': 'Custom5'})
    lineup_data = response.text
    soup = BeautifulSoup(lineup_data, 'html.parser')
    test = soup.find('div', class_='responsive-table')
    for homeid in test.find_all('a', href=re.compile('profil/spieler')):
        home_id.append(homeid.get('href').split('/')[4])
    for homename in test.find_all('a', href=re.compile('profil/spieler')):
        home_name.append(homename.get('href').split('/')[1])
    for grade in test.find_all('span', class_=None):
        homerating.append(grade.text.split()[0])
        homerating.append(None)
Try to check if your selected element is available and scrape the text, else set it to None:
row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
Also try to work with structured dicts instead of lists.
Example
import requests
from bs4 import BeautifulSoup

data = []

for gameday in range(1, 3):
    url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
    response = requests.get(url, headers={'User-Agent': 'Custom5'})
    soup = BeautifulSoup(response.content)

    for a in soup.select('a.liveLink[href*="spielbericht"]'):
        report_url = 'https://www.transfermarkt.de/spiele/aufstellung/spielbericht/' + a.get('href').split('/')[-1]
        response = requests.get(report_url, headers={'User-Agent': 'Custom5'})
        soup = BeautifulSoup(response.text)

        for row in soup.select_one('table.items').select('tr:has(table)'):
            data.append({
                'home_id': row.select_one('a').get('href').split('/')[-1],
                'home_name': row.select_one('a img').get('title'),
                'home_rating': row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
            })

data
Output
[...{'home_id': '45672', 'home_name': 'Kevin Trapp', 'home_rating': '3,4'},{'home_id': '256866', 'home_name': 'Carlos Salcedo', 'home_rating': None},{'home_id': '58178', 'home_name': 'David Abraham', 'home_rating': '3,4'}, {'home_id': '146258', 'home_name': 'Jetro Willems', 'home_rating': '5,5'},...]
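If you want the scraped rows in tabular form, the list of dicts built above converts directly into a pandas DataFrame. A minimal sketch, assuming pandas is installed and data is the list from the example:

import pandas as pd

df = pd.DataFrame(data)  # one row per player: home_id, home_name, home_rating
print(df.head())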

Save dictionary in Excel with Python

I need your help to save the data in Excel. I've parsed a site and I need to put the dictionary into Excel.
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd

SCRAPINGBEE_API_KEY = "bzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"

pages = [
    'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]

rest = []

#GET_LINKS
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)

#GET_REST
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    df = pd.DataFrame(info)
    df.to_excel('./Lagos.xlsx')
I get the link to parse from the list 'rest', then get data from that link. Then I want to save the items from every link to the dictionary 'info' and write them to an Excel file. But the code saves only one line to the file, not all of them. I've missed something.
You are saving df inside the loop under the same name, so each iteration overwrites the previous file and only the last row ends up in Excel. Better to create an empty DataFrame outside the loop, append to it on each iteration, and write it to the Excel file after the loop has completed.
Your altered code will look like this:
all_info = pd.DataFrame()

for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    if len(all_info) == 0:
        all_info = pd.DataFrame(info, index=[0])  # index=[0] gives one row per restaurant
    else:
        all_info = all_info.append(pd.DataFrame(info, index=[0]))

all_info.to_excel('./Lagos.xlsx')
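One caveat with the snippet above: DataFrame.append was deprecated and then removed in pandas 2.0, so on a current pandas install the accumulation step has to use pd.concat instead. A minimal sketch of the pattern with placeholder dicts (in the real code the info dicts come from the scraping loop above):

import pandas as pd

all_info = pd.DataFrame()
# placeholder dicts standing in for the scraped `info` of each restaurant
for info in ({'Restaraunt': 'A', 'Location': 'Lagos', 'Phone': '080', 'web': ''},
             {'Restaraunt': 'B', 'Location': 'Lagos', 'Phone': '081', 'web': ''}):
    # wrap the dict in a list so it becomes a single row, then concatenate
    all_info = pd.concat([all_info, pd.DataFrame([info])], ignore_index=True)

all_info.to_excel('./Lagos.xlsx')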
How about creating a list with all the data, then converting that to a DataFrame, and then outputting that to an Excel file?
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd

SCRAPINGBEE_API_KEY = "zzzzzzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"

pages = [
    'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]

rest = []

#GET_LINKS
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)

#GET_REST
data = []
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    data.append(info)

df = pd.DataFrame(data)
df.to_excel('./Lagos.xlsx')
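A small note on the export step: pandas writes .xlsx files through an engine such as openpyxl, so that package needs to be installed. You can also drop the numeric index and name the sheet explicitly; a minimal sketch, assuming openpyxl is available and data is the list built above:

import pandas as pd

df = pd.DataFrame(data)
df.to_excel('./Lagos.xlsx', sheet_name='restaurants', index=False)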

Web Scraping Dynamic Pages - Adjusting the code

αԋɱҽԃ αмєяιcαη helped me construct this code for scraping reviews from this page, where the reviews are dynamically loaded. I then tried to adjust it so that it scrapes not just the comment body but also the commenters' names, dates, and ratings, and saves the extracted data into an Excel file. But I failed to do so. Could someone help me adjust the code correctly?
This is the code from αԋɱҽԃ αмєяιcαη
import requests
from bs4 import BeautifulSoup
import math


def PageNum():
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        return (num / 3) + 1
    else:
        return math.ceil(num / 3) + 2


def Main():
    num = PageNum()
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"comment-body\"'):
                print(com.text[5:com.text.find(r"\n", 3)])


Main()
This is the code I adjusted, but then I got errors that I couldn't resolve:
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd

df = pd.DataFrame()


def PageNum():
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        return (num / 3) + 1
    else:
        return math.ceil(num / 3) + 2


def Main():
    num = PageNum()
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        for item in range(1, num):
            names = []
            headers = []
            bodies = []
            ratings = []
            published = []
            updated = []
            reported = []
            dateElements = []
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"user-review\"'):
                names.append(article.find('div', attrs={'class': 'name'}).text.strip())
                try:
                    bodies.append(article.find('div', attrs={'class': 'comment-body'}).text.strip())
                except:
                    bodies.append('NA')
                try:
                    ratings.append(article.find('meta', attrs={'itemprop': 'ratingValue'})['content'])
                except:
                    ratings.append('NA')
                dateElements.append(article.find('div', attrs={'class': 'comment-date'}).text.strip())
                print(com.text[5:com.text.find(r"\n", 3)])
            temp_df = pd.DataFrame(
                {'User Name': names, 'Body': bodies, 'Rating': ratings, 'Published Date': dateElements})
            df = df.append(temp_df, sort=False).reset_index(drop=True)


Main()
df.to_csv('Allure10.csv', index=False, encoding='utf-8')
print('excel done')
Here is an adjusted version that collects the names, dates, ratings, and comments into separate lists and writes them to a CSV file:

import requests
from bs4 import BeautifulSoup
import math
import csv


def PageNum():
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        return (num / 3) + 1
    else:
        return math.ceil(num / 3) + 2


def Main():
    num = PageNum()
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        names = []
        dates = []
        comments = []
        rating = []
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"comment-body\"'):
                comments.append(com.text[5:com.text.find(r"\n", 3)])
            for name in soup.findAll("div", class_=r'\"name\"'):
                names.append(name.text[:name.text.find(r"<\/div>", 1)])
            for date in soup.findAll("div", class_=r'\"comment-date\"'):
                dates.append(date.text[:date.text.find(r"<\/div>", 1)])
            for rate in soup.findAll("meta", itemprop=r'\"ratingValue\"'):
                rating.append(rate.get("content")[2:-3])
        return zip(names, dates, rating, comments)


def Save():
    data = Main()
    with open("oka.csv", 'w', newline="", encoding="UTF-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Dates", "Rating", "Comments"])
        writer.writerows(data)


Save()
Output: check-online
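If you prefer the pandas route used elsewhere in this thread, the rows returned by Main() can be written the same way; a minimal sketch, assuming the Main() defined in the answer above:

import pandas as pd

rows = list(Main())  # Main() returns zip(names, dates, rating, comments)
df = pd.DataFrame(rows, columns=['Name', 'Dates', 'Rating', 'Comments'])
df.to_csv('oka.csv', index=False, encoding='utf-8')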

dataframe from 2 classes beautifulsoup

I want to extract the Arbeitsatmosphäre rating and Stadt information based on the review data from the website below for ALL pages, so the desired output should look like the example below:
Arbeitsatmosphare | Stadt
1. 4.00 | Berlin
2. 5.00 | Frankfurt
3. 3.00 | Munich
4. 5.00 | Berlin
5. 4.00 | Berlin
The code below extracts the Pro data from all pages of the website and works fine. I tried to update it by adding 2 lists, the Arbeitsatmosphäre rating and Stadt, and breaking the cycle if the Arbeitsatmosphäre rating is missing, but my code is not working. Can you help?
import requests
from bs4 import BeautifulSoup

pro = []

with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        new_comments = [
            pro.find_next_sibling('p').get_text()
            for pro in soup.find_all('h2', text='Pro')
        ]
        if not new_comments:
            print(f"No more comments. Page: {page}")
            break
        pro += new_comments
        print(pro)
        #print(len(pro))
        page += 1

print(pro)
UPD
Adding my code that is not working; however, I think there should be a simpler solution.
Arbeit = []
Stadt = []

with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        new_comments1 = [
            Arbeit.find_next_sibling('span').get_text()
            for Arbeit in soup.find_all('span', text='Arbeitsatmosphäre')
        ]
        new_comments2 = [
            Stadt.find_next_sibling('div').get_text()
            for Stadt in soup.find_all('div', text='Stadt')
        ]
        if not new_comments1:
            print(f"No more comments. Page: {page}")
            break
        Arbeit += new_comments1
        Stadt += new_comments2
        print(Arbeit)
        print(Stadt)
        #print(len(pro))
        page += 1
You can try:
import requests
from bs4 import BeautifulSoup
import pandas as pd

arbeit = []
firma = []
stadt = []

with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagen/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        print("Number of articles: " + str(len(articles)))
        for article in articles:
            rating_tags = article.find_all('span', {'class': 'rating-badge'})
            arbeit.append(rating_tags[0].text.strip())

            detail_div = article.find_all('div', {'class': 'review-details'})[0]
            nodes = detail_div.find_all('li')
            firma_node = nodes[0]
            stadt_node = nodes[1]

            firma_node_div = firma_node.find_all('div')
            firma_name = firma_node_div[1].text.strip()
            firma.append(firma_name)

            stadt_node_div = stadt_node.find_all('div')
            stadt_name = stadt_node_div[1].text.strip()
            stadt.append(stadt_name)
        page += 1
        pagination = soup.find_all('div', {'class': 'paginationControl'})
        if not pagination:
            break

df = pd.DataFrame({'Arbeitsatmosphäre': arbeit, 'Stadt': stadt})
print(df)
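If a review is missing the rating badge or one of the detail entries, the direct indexing above raises an IndexError. Below is a hedged variant of the inner per-article loop that falls back to None instead; a sketch only, assuming the same class names and the articles/arbeit/firma/stadt variables from the answer above:

for article in articles:
    rating_tags = article.find_all('span', {'class': 'rating-badge'})
    arbeit.append(rating_tags[0].text.strip() if rating_tags else None)

    detail_divs = article.find_all('div', {'class': 'review-details'})
    nodes = detail_divs[0].find_all('li') if detail_divs else []
    firma.append(nodes[0].find_all('div')[1].text.strip() if len(nodes) > 0 else None)
    stadt.append(nodes[1].find_all('div')[1].text.strip() if len(nodes) > 1 else None)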

Cannot web scrape a page which includes pagination and products in grid layout using python

I want to web scrape the following webpage:
https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber=1
But I only keep getting part of the URL links: the first 12 of the first 2 pages, not the 3rd, and not all of the links. I used the following:
import time
import requests
from bs4 import BeautifulSoup

infodomain = 'https://www.websupplies.gr'  # base domain used to build absolute links
initial_url = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10'
caturl = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber={}'

r = requests.get(initial_url)
if r.status_code == 200:
    Myhtml = r.text
    soup = BeautifulSoup(Myhtml, 'html.parser')

    #GETTING THE LAST PAGE
    last_page = soup.find('div', class_='pager').find('li', class_='next-page').a['href'].split('=')[1]

    #GETTING THE PAGE URL LINKS
    dept_page_url = [caturl.format(i) for i in range(1, int(last_page)+1)]
    time.sleep(2)

    for pageurl in dept_page_url:
        r = requests.get(pageurl)
        if r.status_code == 200:
            Myhtml = r.text
            soup = BeautifulSoup(Myhtml, 'html.parser')

            #GETTING THE PRODUCT LINKS
            productlist = soup.find('div', attrs={'class':'item-grid'})
            atagslist = productlist.findAll('a', href=True)
            links_with_text = []
            final_links = []
            for a in atagslist:
                if a.text:
                    mlink = a['href']
                    if mlink != '#':
                        links_with_text.append(infodomain + mlink)

            #DELETE DUPLICATES
            links_with_text = list(dict.fromkeys(links_with_text))
            links_with_text.extend(links_with_text)
How can I get all the URL links?
You could mimic the POST request the page makes and add an exit condition based on whether the next-page element is present:
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'user-agent': 'Mozilla/5.0',
    'content-type': 'application/json; charset=UTF-8',
    'authority': 'www.websupplies.gr',
    'x-requested-with': 'XMLHttpRequest'
}

links = []
page = 1

with requests.Session() as s:
    while True:
        # JSON filter payload the page posts; only pageNumber changes between requests
        data = ('{"categoryId":"405","manufacturerId":"0","vendorId":"0","priceRangeFilterModel7Spikes":{"CategoryId":"405","ManufacturerId":"0","VendorId":"0","SelectedPriceRange":{},"MinPrice":"204","MaxPrice":"3850"},"specificationFiltersModel7Spikes":{"CategoryId":"405","ManufacturerId":"0","VendorId":"0","SpecificationFilterGroups":[{"Id":"658","FilterItems":[{"Id":"4821","FilterItemState":"Unchecked"},{"Id":"1969","FilterItemState":"Unchecked"},{"Id":"4394","FilterItemState":"Unchecked"},{"Id":"1971","FilterItemState":"Unchecked"},{"Id":"5459","FilterItemState":"Unchecked"},{"Id":"1953","FilterItemState":"Unchecked"},{"Id":"1962","FilterItemState":"Unchecked"},{"Id":"1963","FilterItemState":"Unchecked"}]},{"Id":"900","FilterItems":[{"Id":"2503","FilterItemState":"Unchecked"},{"Id":"2504","FilterItemState":"Unchecked"},{"Id":"2505","FilterItemState":"Unchecked"}]},{"Id":"944","FilterItems":[{"Id":"2715","FilterItemState":"Unchecked"},{"Id":"2714","FilterItemState":"Unchecked"}]},{"Id":"980","FilterItems":[{"Id":"2994","FilterItemState":"Unchecked"},{"Id":"2835","FilterItemState":"Unchecked"},{"Id":"2836","FilterItemState":"Unchecked"},{"Id":"4381","FilterItemState":"Unchecked"}]},{"Id":"988","FilterItems":[{"Id":"2882","FilterItemState":"Unchecked"},{"Id":"2883","FilterItemState":"Unchecked"},{"Id":"2989","FilterItemState":"Unchecked"}]},{"Id":"901","FilterItems":[{"Id":"2520","FilterItemState":"Unchecked"},{"Id":"2521","FilterItemState":"Unchecked"},{"Id":"2512","FilterItemState":"Unchecked"},{"Id":"2611","FilterItemState":"Unchecked"},{"Id":"2513","FilterItemState":"Unchecked"},{"Id":"5995","FilterItemState":"Unchecked"},{"Id":"2970","FilterItemState":"Unchecked"},{"Id":"2530","FilterItemState":"Unchecked"},{"Id":"5996","FilterItemState":"Unchecked"}]},{"Id":"986","FilterItems":[{"Id":"2971","FilterItemState":"Unchecked"},{"Id":"2872","FilterItemState":"Unchecked"},{"Id":"2871","FilterItemState":"Unchecked"},{"Id":"4995","FilterItemState":"Unchecked"},{"Id":"5009","FilterItemState":"Unchecked"}]},{"Id":"761","FilterItems":[{"Id":"4358","FilterItemState":"Unchecked"},{"Id":"4359","FilterItemState":"Unchecked"},{"Id":"4361","FilterItemState":"Unchecked"},{"Id":"5460","FilterItemState":"Unchecked"},{"Id":"4362","FilterItemState":"Unchecked"},{"Id":"4822","FilterItemState":"Unchecked"},{"Id":"4371","FilterItemState":"Unchecked"}]},{"Id":"917","FilterItems":[{"Id":"4826","FilterItemState":"Unchecked"},{"Id":"4825","FilterItemState":"Unchecked"},{"Id":"5357","FilterItemState":"Unchecked"},{"Id":"4827","FilterItemState":"Unchecked"},{"Id":"5345","FilterItemState":"Unchecked"},{"Id":"4828","FilterItemState":"Unchecked"}]},{"Id":"911","FilterItems":[{"Id":"4843","FilterItemState":"Unchecked"},{"Id":"4845","FilterItemState":"Unchecked"},{"Id":"4850","FilterItemState":"Unchecked"},{"Id":"4851","FilterItemState":"Unchecked"},{"Id":"5891","FilterItemState":"Unchecked"},{"Id":"5892","FilterItemState":"Unchecked"},{"Id":"5291","FilterItemState":"Unchecked"},{"Id":"6011","FilterItemState":"Unchecked"},{"Id":"6552","FilterItemState":"Unchecked"},{"Id":"6949","FilterItemState":"Unchecked"}]}]},"attributeFiltersModel7Spikes":null,"manufacturerFiltersModel7Spikes":{"CategoryId":"405","ManufacturerFilterItems":[{"Id":"268","FilterItemState":"Unchecked"},{"Id":"63","FilterItemState":"Unchecked"},{"Id":"191","FilterItemState":"Unchecked"},{"Id":"9","FilterItemState":"Unchecked"},{"Id":"330","FilterItemState":"Unchecked"},{"Id":"5","FilterItemState":"Unchecked"}]},"vendorFiltersModel7Spikes":null,"pageNumber":"'
                + str(page) +
                '","orderby":"10","viewmode":"list","pagesize":"48","queryString":"","shouldNotStartFromFirstPage":true,"onSaleFilterModel":null,"keyword":"","searchCategoryId":"0","searchManufacturerId":"0","priceFrom":"","priceTo":"","includeSubcategories":"False","searchInProductDescriptions":"False","advancedSearch":"False","isOnSearchPage":"False"}')
        r = s.post('https://www.websupplies.gr/getFilteredProducts', headers=headers, data=data)
        soup = bs(r.content, 'lxml')
        links.append([item['href'] for item in soup.select('.product-title a')])
        page += 1
        if soup.select_one('.next-page') is None:
            break

base = 'https://www.websupplies.gr'
final_list = {base + item for i in links for item in i}
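A usage note on the last two lines: links is a list of per-page lists, so the set comprehension flattens it and removes duplicate product URLs. Something like the following (using the variables defined above) shows what came back:

print(f"{sum(len(page_links) for page_links in links)} links collected, {len(final_list)} unique products")
for product_url in sorted(final_list)[:5]:
    print(product_url)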
