I need to get only the first href of each product.
Can someone give me a hint? Now i get more than one href of each
import requests
from bs4 import BeautifulSoup
baseurl = 'https://www.roco.cc/'
headers = {
'UserAgent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
}
productlinks = []
for x in range(1,30):
r = requests.get(
f'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html?p={x}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('li', class_='item product product-item')
for item in productlist:
for link in item.find_all('a', href=True):
productlinks.append(baseurl + link['href'])
print(len(productlinks))
Try:
import requests
from bs4 import BeautifulSoup
url = "https://www.roco.cc/ren/products/locomotives/steam-locomotives.html?p={}"
productlinks = []
for p in range(1, 3): # <--- increase number of pages here
soup = BeautifulSoup(requests.get(url.format(p)).content, "html.parser")
for li in soup.select("li.product-item"):
productlinks.append(li.a["href"])
print(len(productlinks))
Prints:
48
Then:
print(*productlinks, sep="\n")
Prints:
https://www.roco.cc/ren/products/locomotives/steam-locomotives/62200-br-64-steam-locomotive.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/63280-dampflok-0310-der-db-mit-altbaukessel-in-stahlblauer-farbgebung.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/63281-dampflok-0310-der-db-mit-altbaukessel-in-stahlblauer-farbgebung.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/69281-br0310-express-train-steam-locomotive.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/63255-keine-anderung-der-dome.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/69255-dampflok-935-ac.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/63256-935-12-series-steam-locomotive-of-drg---1-%cc%81d1-%cc%81h2-design.html
https://www.roco.cc/ren/products/locomotives/steam-locomotives/69256-935-12-series-steam-locomotive-of-drg1%c2%b4d1%c2%b4h2-design.html
...and so on.
Related
I'm trying to get all the category, sub category and sub sub category and so on of authors URL from dmoz website using BeautifulSoup.
I'm getting the following output:
# Missing the every 2nd option/URL in first step
/Arts/Literature/Authors/A
/Arts/Literature/Authors/C
/Arts/Literature/Authors/E
/Arts/Literature/Authors/G
/Arts/Literature/Authors/Horror
. . .
# Missing the every 1st option/URL in second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above code 2nd element is missing in 1st step and 1st element in 2nd step.
Here is my code:
scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# get all the root category author list
for test in find_row:
if test.find('div', attrs = {'class':'panel-body'}):
test_link = test.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
# now get the sub or sub-sub category author URL list
for cat in sub_cat:
scrape_cat_url = "http://dmoz.org%s" % (cat)
print('scraping...', scrape_cat_url)
page = session.get(scrape_cat_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# if sub category go next level or restart
for row in find_row:
if row.find('div', attrs = {'class':'panel-body'}):
test_link = row.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
records.append(scrape_cat_url)
else:
records.append(scrape_cat_url)
# remove the category url from the sub_cat list
sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub category and sub sub category URL of authors?
Try this streamlined version of your code:
from bs4 import BeautifulSoup
headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs = {'class':'row'})[6:7]
cats = []
for row in find_rows:
links = row.find_all('a')
for link in links:
cats.append(link['href'])
cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
sub_cats = []
for cat in cats:
scrape_url = f"http://dmozlive.com{cat}"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs = {'class':'row'})[6:7]
for row in find_rows:
links = row.find_all('a')
for link in links:
sub_cats.append(link['href'])
subcats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
The following code may meet your expectation that's pull all the categories and sub-categories urls.
import requests
from bs4 import BeautifulSoup
url= 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}
req=requests.get(url,headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')
for cat_url in soup.select('.list-group.col-md-6 a'):
cat_url = 'http://dmozlive.com' + cat_url.get('href')
#print(cat_url)
req2=requests.get(cat_url,headers=headers)
soup2 = BeautifulSoup(req2.text, 'html.parser')
for author_url in soup2.select('.list-group-item'):
author_url= 'http://dmozlive.com' + str(author_url.get('href'))
print(author_url)
How to get all 'href' with soup in python ? I try so many times but in vain.
Whatever I use 'soup.find' or 'soup.find_all' method to strugle for the 'href', it doesn't work.
python version:3.10
!pip install requests
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
for page in range(1,2):
url = "https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
r = requests.get(url, headers = headers)
Soup = BeautifulSoup(r.text,"lxml")
for link in Soup.find_all('ul',class_="searchItem Stype"):
print(len(link))
Link = link.li.a
LINK = Link.get('href')
print(LINK)
productlink.append(LINK)
print(productlink)
sorry i misunderstood totally your problem. find_all is not a very versatile tool and you were searching for the wrong ul
i barely changed your code but it seems to work now
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
for page in range(1,2):
url = f"https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
r = requests.get(url, headers = headers)
Soup = BeautifulSoup(r.text,"lxml")
for link in Soup.select('ul#surveyContent > li > a[href]:first-of-type'):
print(len(link))
# ~ Link = link.li.a
LINK = link.get('href')
print(LINK)
productlink.append(LINK)
print(productlink)
for page in range(1,2):
url = "https://m.momomall.com.tw/m/store/DCategory.jsp?entp_code=103487&category_code=all&orderby=3&page={}".format(page)
r = requests.get(url,headers = headers)
soup = BeautifulSoup(r.text,'lxml')
for goods_code in soup.select('a.nofollowBtn_star'):
Goods_code = 'https://www.momomall.com.tw/s/103487/'+goods_code.get('goods_code')+'/'
goodlink.append(Goods_code)
for URL in goodlink:
R = requests.get(URL, headers = headers)
Soup = BeautifulSoup(R.text,"lxml")
for dataprice in Soup.select('script'):
import re
discount_regex=re.compile('discountPrice = (\d{1,5})')
print(re.search(discount_regex, dataprice).group(1))
```
So i have taken the title of the medicines from this link : Medicines List
now i want to get the content for every medicines meanwhile every medicines has it owns link
Example :
Medicines Example
how can I get the content of that medicines using BeautifulSoup4 and requests library?
import requests
from bs4 import BeautifulSoup
from pprint import pp
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}
def main(url):
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
title = [x.text for x in soup.select(
'a[class$=section__item-link]')]
count = 0
for x in range (0, len(title)):
count += 1
print("{0}. {1}\n".format(count, title[x]))
main('https://www.klikdokter.com/obat')
Based on what I can see as the response from https://www.klikdokter.com/obat you should be able to do something like this:-
import requests
from bs4 import BeautifulSoup
AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://www.klikdokter.com/obat'
headers = {'User-Agent': AGENT}
response = requests.get(BASEURL, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup.find_all('a', class_='topics-index--section__item-link'):
href = tag.get('href')
if href is not None:
print(href)
response = requests.get(href, headers=headers)
response.raise_for_status()
""" Do your processing here """
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import re
base_url = 'https://www.brambleco.com/c/furniture/cabinets-sideboards'
headers = {
'User-Agents' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
cab_list = []
for x in range(1,5):
r = requests.get(f'https://www.brambleco.com/c/furniture/cabinets-sideboards/page/{x}')
soup = BeautifulSoup(r.content, 'lxml')
product = soup.find_all('div', class_='new-product-images')
igg=[]
for img in product:
for ig in img.find_all('img',{"data-original":True}):
igg.append(ig)
#print(igg)
for customize in product:
for cabinet in customize.find_all('a', href = True):
#cab = [item.strip() for item in cabinet if str(item)]
#print(cabinet['href'])
m = cabinet['href']
n = m.strip()
cab_list.append(n)
for cabinet in cab_list:
r = requests.get(cabinet, headers=headers)
soup = BeautifulSoup(r.content,'lxml')
name = soup.find('h1', {'id':'product-introduction-name'}).text
#print(name)
table = soup.find("table", {"class": "dimension imperial"})
cab = table.find_all('tr')
#print(cab)
tab = []
for result in cab:
try:
width = result.find('span', attrs={'class':'cm'}).text.strip()
depth = result.find('span', attrs={'class':'inch'}).text.strip()
tab.append((width, depth))
except:
None
#print(tab)
tab2 = []
table = soup.find("table", {"class": "product-attribute-specs"})
for ward in table:
try:
detail = ward.find('td').text.strip()
tab2.append(detail)
except:
None
tab.append(tab2)
print(tab)
I am trying to extract some information about an App on Google Play and BeautifulSoup doesn't seem to work.
The link is this(say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html)
l = soup.find_all("div", { "class" : "document-subtitles"})
print len(l)
0 #How is this 0?! There is clearly a div with that class
I decided to go all in, didn't work either:
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print i
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
html = r.content
soup = BeautifulSoup(html, "html.parser")
title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()
print(title)
print(rating)
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
def get_info(soup, text):
return soup.find("div", class_="title", text=lambda t: t and t.strip() == text).\
find_next_sibling("div", class_="content").get_text(strip=True)
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email#here.com