Can't get all links of defferent items - python

I've created a script to harvest the links of different products from a webpage. My intention is to scrape the links only when the products have Ajouter au panier sign, meaning Add to Basket. The html structures are very straightforward and easy to play with but the logic to get the desired links appears to be tricky. I've used three different links to show the variation.
Few urls lead to the desired products but still there are catalogues (if i make use of their links) which produces some more products. Check out the image links to see for yourself. I've drawn circles around the catalogues in the first image which can still produces the desired products whereas in that page few desired products are already there.
check out the variation
another one: only catalogues
This is the script I've written:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
urls = (
"https://www.directmedical.fr/categorie/aspirateurs-de-mucosite.html",
"https://www.directmedical.fr/categorie/literie.html",
"https://www.directmedical.fr/categorie/vetement.html"
)
def get_links(link):
r = requests.get(link)
soup = BeautifulSoup(r.text,"lxml")
for item in soup.select(".browseCategoryName a"):
ilink = urljoin(link,item.get("href"))
print(ilink)
if __name__ == '__main__':
for url in urls:
get_links(url)
How can I get all the products links having Ajouter au panier signs using those urls?

If you need to select Product links from both initial page and (if there are no products on initial page) from Category page, try
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
domain = "https://www.directmedical.fr/"
urls = (
"https://www.directmedical.fr/categorie/aspirateurs-de-mucosite.html",
"https://www.directmedical.fr/categorie/literie.html",
"https://www.directmedical.fr/categorie/vetement.html"
)
def get_links(link):
r = requests.get(link)
soup = BeautifulSoup(r.text, "lxml")
products = soup.select(".browseElements td > a")
if products:
for item in products:
ilink = urljoin(link, item.get("href"))
print(ilink)
else:
categories = [urljoin(domain, item.get("href")) for item in soup.select(".browseChildsCategorys td > a")]
for category in categories:
c = requests.get(category)
c_soup = BeautifulSoup(c.text, "lxml")
for item in c_soup.select(".browseElements td > a"):
c_link = urljoin(domain, item.get("href"))
print(c_link)
if __name__ == '__main__':
for url in urls:
get_links(url)

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a code that (somewhat) succesfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-td=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th' : 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find("div", class_="profile-box")
row = {"Name" : name, "URL" : player_url}
for p in div_profile_box.find_all("p"):
try:
key, value = p.get_text(strip=True).split(':', 1)
row[key.strip()] = value.strip()
except: # not all entries have values
pass
data.append(row)
return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

python crawling beautifulsoup how to crawl several pages?

Please Help.
I want to get all the company names of each pages and they have 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So Here is my code so far.
Can I get just the title (company name) of 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
response = requests.get(URL)
whole_source = whole_source + response.text
soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
print(company.text)
---------Output of one page
---------page source :)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use the soup.findAll to find the list of company in the format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use .contents function to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests
maximum = 12
company_list = [] # List for result storing
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
response = requests.get(URL)
print(page_number)
whole_source = response.text
soup = BeautifulSoup(whole_source, 'html.parser')
for entry in soup.findAll('strong', attrs={'class': 'company'}): # Finding all company names in the page
company_list.append(entry.find('span').contents[0]) # Extracting name from the result
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
image : code captured in jupyter notebook
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup
company_list=[]
for n in range(12):
url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
webpage = urlopen(url)
source = BeautifulSoup(webpage,'html.parser',from_encoding='utf-8')
companys = source.findAll('strong',{'class':'company'})
for company in companys:
company_list.append(company.get_text().strip().replace('\n','').replace('\t','').replace('\r',''))
file = open('company_name1.txt','w',encoding='utf-8')
for company in company_list:
file.write(company+'\n')
file.close()

Trouble parsing product names out of some links with different depth

I've written a script in python to reach the target page where each category has their avaiable item names in a website. My below script can get the product names from most of the links (generated through roving category links and then subcategory links).
The script can parse sub-category links revealed upon clicking + sign located right next to each category which are visible in the below image and then parse all the product names from the target page. This is one of such target pages.
However, few of the links do not have the same depth as other links. For example this link and this one are different from usual links like this one.
How can I get all the product names from all the links irrespective of their different depth?
This is what I've tried so far:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
link = "https://www.courts.com.sg/"
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select(".nav-dropdown li a"):
if "#" in item.get("href"):continue #kick out invalid links
newlink = urljoin(link,item.get("href"))
req = requests.get(newlink)
sauce = BeautifulSoup(req.text,"lxml")
for elem in sauce.select(".product-item-info .product-item-link"):
print(elem.get_text(strip=True))
How to find trget links:
The site has six main product categories. Products that belong to a subcategory can also be found in a main category (for example the products in /furniture/furniture/tables can also be found in /furniture), so you only have to collect products from the main categories. You could get the categories links from the main page, but it'd be easier to use the sitemap.
url = 'https://www.courts.com.sg/sitemap/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
cats = soup.select('li.level-0.category > a')[:6]
links = [i['href'] for i in cats]
As you've mentioned there are some links that have differend structure, like this one: /televisions. But, if you click the View All Products link on that page you will be redirected to /tv-entertainment/vision/television. So, you can get all the /televisions rpoducts from /tv-entertainment. Similarly, the products in links to brands can be found in the main categories. For example, the /asus products can be found in /computing-mobile and other categories.
The code below collects products from all the main categories, so it should collect all the products on the site.
from bs4 import BeautifulSoup
import requests
url = 'https://www.courts.com.sg/sitemap/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
cats = soup.select('li.level-0.category > a')[:6]
links = [i['href'] for i in cats]
products = []
for link in links:
link += '?product_list_limit=24'
while link:
r = requests.get(link)
soup = BeautifulSoup(r.text, 'html.parser')
link = (soup.select_one('a.action.next') or {}).get('href')
for elem in soup.select(".product-item-info .product-item-link"):
product = elem.get_text(strip=True)
products += [product]
print(product)
I've increased the number of products per page to 24, but still this code takes a lot of time, as it collects products from all main categories and their pagination links. However, we could make it much faster with the use of threads.
from bs4 import BeautifulSoup
import requests
from threading import Thread, Lock
from urllib.parse import urlparse, parse_qs
lock = Lock()
threads = 10
products = []
def get_products(link, products):
soup = BeautifulSoup(requests.get(link).text, 'html.parser')
tags = soup.select(".product-item-info .product-item-link")
with lock:
products += [tag.get_text(strip=True) for tag in tags]
print('page:', link, 'items:', len(tags))
url = 'https://www.courts.com.sg/sitemap/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
cats = soup.select('li.level-0.category > a')[:6]
links = [i['href'] for i in cats]
for link in links:
link += '?product_list_limit=24'
soup = BeautifulSoup(requests.get(link).text, 'html.parser')
last_page = soup.select_one('a.page.last')['href']
last_page = int(parse_qs(urlparse(last_page).query)['p'][0])
threads_list = []
for i in range(1, last_page + 1):
page = '{}&p={}'.format(link, i)
thread = Thread(target=get_products, args=(page, products))
thread.start()
threads_list += [thread]
if i % threads == 0 or i == last_page:
for t in threads_list:
t.join()
print(len(products))
print('\n'.join(products))
This code collects 18,466 products from 773 pages in about 5 minutes. I'm using 10 threads because I don't want to stress the server too much, but you could use more (most servers can handle 20 threads easily).
I would recommend starting your scrape from the pages sitemap
Found here
If they were to add products, it's likely to show up here as well.
Since your main issue is finding the links, here is a generator that will find all of the category and sub-category links using the sitemap krflol pointed out in his solution:
from bs4 import BeautifulSoup
import requests
def category_urls():
response = requests.get('https://www.courts.com.sg/sitemap')
html_soup = BeautifulSoup(response.text, features='html.parser')
categories_sitemap = html_soup.find(attrs={'class': 'xsitemap-categories'})
for category_a_tag in categories_sitemap.find_all('a'):
yield category_a_tag.attrs['href']
And to find the product names, simply scrape each of the yielded category_urls.
I saw the website for parsing and found that all the products are available at the bottom left side of the main page https://www.courts.com.sg/ .After clicking one of these we goes to advertisement front page of a particular category. Where we have to go in click All Products for getting it.
Following is the code as whole:
import requests
from bs4 import BeautifulSoup
def parser():
parsing_list = []
url = 'https://www.courts.com.sg/'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
ul = soup.find('footer',{'class':'page-footer'}).find('ul')
for l in ul.find_all('li'):
nextlink = url + l.find('a').get('href')
response = requests.get(nextlink)
inner_soup = BeautifulSoup(response.text, "html.parser")
parsing_list.append(url + inner_soup.find('div',{'class':'category-static-links ng-scope'}).find('a').get('href'))
return parsing_list
This function will return list of all products of all categories which your code didn't scrape from it.

Scraping multiple paginated links with BeautifulSoup and Requests

Python Beginner here. I'm trying to scrape all products from one category on dabs.com. I've managed to scrape all products on a given page, but I'm having trouble iterating over all the paginated links.
Right now, I've tried to isolate all the pagination buttons with the span class='page-list" but even that isn't working. Ideally, I would like to make the crawler keep clicking next until it has scraped all products on all pages. How can I do this?
Really appreciate any input
from bs4 import BeautifulSoup
import requests
base_url = "http://www.dabs.com"
page_array = []
def get_pages():
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
page_list = soup.findAll('span', class="page-list")
pages = page_list[0].findAll('a')
for page in pages:
page_array.append(page.get('href'))
def scrape_page(page):
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
Product_table = soup.findAll("table")
Products = Product_table[0].findAll("tr")
if len(soup.findAll('tr')) > 0:
Products = Products[1:]
for row in Products:
cells = row.find_all('td')
data = {
'description' : cells[0].get_text(),
'price' : cells[1].get_text()
}
print data
get_pages()
[scrape_page(base_url + page) for page in page_array]
Their next page button has a title of "Next" you could do something like:
import requests
from bs4 import BeautifulSoup as bs
url = 'www.dabs.com/category/computing/11001/'
base_url = 'http://www.dabs.com'
r = requests.get(url)
soup = bs(r.text)
elm = soup.find('a', {'title': 'Next'})
next_page_link = base_url + elm['href']
Hope that helps.

Display all search results when web scraping with Python

I'm trying to scrape a list of URL's from the European Parliament's Legislative Observatory. I do not type in any search keyword in order to get all links to documents (currently 13172). I can easily scrape a list of the first 10 results which are displayed on the website using the code below. However, I want to have all links so that I would not need to somehow press the next page button. Please let me know if you know of a way to achieve this.
import requests, bs4, re
# main url of the Legislative Observatory's search site
url_main = 'http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y'
# function gets a list of links to the procedures
def links_to_procedures (url_main):
# requesting html code from the main search site of the Legislative Observatory
response = requests.get(url_main)
soup = bs4.BeautifulSoup(response.text) # loading text into Beautiful Soup
links = [a.attrs.get('href') for a in soup.select('div.procedure_title a')] # getting a list of links of the procedure title
return links
print(links_to_procedures(url_main))
You can follow the pagination by specifying the page GET parameter.
First, get the results count, then calculate the number of pages to process by dividing the count on the results count per page. Then, iterate over pages one by one and collect the links:
import re
from bs4 import BeautifulSoup
import requests
response = requests.get('http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y')
soup = BeautifulSoup(response.content)
# get the results count
num_results = soup.find('span', class_=re.compile('resultNum')).text
num_results = int(re.search('(\d+)', num_results).group(1))
print "Results found: " + str(num_results)
results_per_page = 50
base_url = "http://www.europarl.europa.eu/oeil/search/result.do?page={page}&rows=%s&sort=d&searchTab=y&sortTab=y&x=1411566719001" % results_per_page
links = []
for page in xrange(1, num_results/results_per_page + 1):
print "Current page: " + str(page)
url = base_url.format(page=page)
response = requests.get(url)
soup = BeautifulSoup(response.content)
links += [a.attrs.get('href') for a in soup.select('div.procedure_title a')]
print links

Categories