web scraping python for car search - python

I wanted to look for Skoda 2018 with less than 100K KM from this site
https://www.autocenter.co.il/
However, I cannot find the right method.
Here is what I did:
from bs4 import BeautifulSoup
import requests
# Download the site's home page; this URL carries no search filters.
url = "https://www.autocenter.co.il/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Gather every <div class="product-wrapper-inner"> and show the first one's text.
first = soup.find_all("div", class_="product-wrapper-inner")
print(first[0].get_text())

As mentioned in the comments by @Elyes, construct your URL based on your search criteria:
from bs4 import BeautifulSoup
import requests
# The search criteria are encoded in the query string: manufacturer id,
# earliest model year and mileage range.
# NOTE(review): flr_manufacturer=196 is presumably Skoda -- confirm on the site.
url = "https://www.autocenter.co.il/shop/?flr_manufacturer=196&flr_from_year=2018&flr_mileage_range=0-100000"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Select the <div> elements whose class attribute is exactly this string.
first = soup.find_all("div", {"class": "col-lg-11 text-center px-4 px-lg-15"})
# Print the texts explicitly: a bare expression only echoes its value in a
# REPL/notebook and is silently discarded when run as a script.
print([i.text for i in first])
output:

Related

How to web scrape meta content - Python web scraping question

I want to only scrape the word "Automobile" not the entire line with the meta brackets.
Desired output: "Automobile"
Can you please tell me how to fix this? Thanks!
from bs4 import BeautifulSoup
import requests
import csv
# Fetch the article and abort early on an HTTP error status.
URL = 'https://www.electrive.com/2022/02/13/skoda-reveals-uk-pricing-for-enyaq-coupe-iv-vrs/'
response = requests.get(URL)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# Look up the <meta property="article:section"> tag; printing it shows the
# whole element, not just its content attribute.
category2 = soup.find('meta', attrs={'property': 'article:section'})
print(category2)
Output:
<meta content="Automobile" property="article:section"/>
Just add ['content'] to your soup object.
import requests
from bs4 import BeautifulSoup
# Fetch the article and abort early on an HTTP error status.
URL = 'https://www.electrive.com/2022/02/13/skoda-reveals-uk-pricing-for-enyaq-coupe-iv-vrs/'
response = requests.get(URL)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# Locate the <meta property="article:section"> tag, then read only the value
# of its content attribute.
section_tag = soup.find('meta', attrs={'property': 'article:section'})
category2 = section_tag['content']
print(category2)
Output:
Automobile

remove html tags from string using bs4

I'm trying to make a program to read the price of bitcoin from a website. I used bs4 and was able to get the section I was looking for, but it's surrounded by the HTML tags.
output: <div class="priceValue___11gHJ">$52,693.18</div>
I just want the price. I have tried the regex and lxml methods, but I keep getting errors.
import requests
from bs4 import BeautifulSoup
# Download the bitcoin page.
url = "https://coinmarketcap.com/currencies/bitcoin/"
r = requests.get(url)
# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, 'html5lib')
# Locate the price <div> by its class; printing the tag itself includes the
# surrounding markup.
find_div = soup.find('div', class_="priceValue___11gHJ")
print(find_div)
You need to do .text:
import requests
from bs4 import BeautifulSoup
# Download the bitcoin page.
url = "https://coinmarketcap.com/currencies/bitcoin/"
r = requests.get(url)
# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, 'html5lib')
# Locate the price <div> by its class and print only its text content.
find_div = soup.find('div', class_="priceValue___11gHJ")
print(find_div.get_text())  # $52,693.18

How do I exclude certain beautifulsoup results that I don't want?

I am having issues trying to exclude some of the results returned by my Beautiful Soup program. This is my code:
from bs4 import BeautifulSoup
import requests
# Fetch and parse the Wikipedia article.
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Print the href of every anchor; .get() yields None for anchors without one.
for link in soup.find_all('a'):
    print(link.get('href'))
I don't want to get the results that start with a "#" for example: #cite_ref-18
I have tried using for loops but I get this error message: KeyError: 0
You can use the str.startswith() method:
from bs4 import BeautifulSoup
import requests
# Fetch and parse the Wikipedia article.
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
for tag in soup.find_all('a'):
    link = tag.get('href')
    # str() keeps the check safe for anchors without an href (link is None);
    # only in-page fragment links such as "#cite_ref-18" are skipped.
    if not str(link).startswith('#'):
        print(link)
You can use CSS selector a[href]:not([href^="#"]). This will select all <a> tags with href= attribute but not the ones starting with # character:
import requests
from bs4 import BeautifulSoup
# Fetch and parse the Wikipedia article.
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# a[href] keeps only anchors that have an href attribute;
# :not([href^="#"]) drops those whose href starts with "#".
for link in soup.select('a[href]:not([href^="#"])'):
    print(link['href'])

locating child element by BeautifulSoup

I am new to BeautifulSoup and I am practicing with little tasks. Here I try to get the "previous" link on this site. The HTML is
here
My code is
import requests, bs4
from bs4 import BeautifulSoup
# Fetch the xkcd front page and parse it.
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
# Narrow the search to the <div id="comic"> element.
result = soup.find('div', id="comic")
# NOTE(review): the trailing .find('href') searches for a child *tag* named
# "href"; it does not read the anchor's href attribute. Any step in this chain
# that matches nothing yields None, which is presumably where the NoneType
# result comes from -- confirm against the live page markup.
url2 = result.find('ul', class_='comicNav').find('a', rel='prev').find('href')
But it shows NoneType. I have read some posts about child elements in HTML, and I tried some different things, but it still does not work. Thank you in advance for your help.
You could use a CSS selector instead.
import requests, bs4
from bs4 import BeautifulSoup
# Fetch the xkcd front page and parse it.
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
# Inside the .comicNav element, match anchors whose rel attribute contains
# the word "prev", and keep the first match.
prev_matches = soup.select('.comicNav a[rel~="prev"]')
result = prev_matches[0]
print(result)
If you want just the href, change that line to:
# Index the first match with ["href"] to get the link target rather than the tag.
result = soup.select('.comicNav a[rel~="prev"]')[0]["href"]
To get the prev link, find the ul tag and then find the a tag inside it. Try the code below.
import requests, bs4
from bs4 import BeautifulSoup
# Fetch the xkcd front page and parse it.
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
# Walk down step by step: navigation list -> "prev" anchor -> its href value.
nav_list = soup.find('ul', class_='comicNav')
prev_anchor = nav_list.find('a', rel='prev')
url2 = prev_anchor['href']
print(url2)
Output:
/2254/

Scraping Amazon products names

I am trying to gather the product names from the first two pages on Amazon based on the seller name. When I request the page, it has all the elements I need; however, when I use BeautifulSoup, they are not listed. Here is my code:
import requests
from bs4 import BeautifulSoup
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get(
    "https://www.amazon.com/s?me=A3WE363L17WQR&marketplaceID=ATVPDKIKX0DER",
    headers=headers,
)
soup = BeautifulSoup(res.text, "html.parser")
# Find every anchor that carries an href attribute; the result is not stored
# or printed here.
soup.find_all("a", attrs={"href": True})
The links of products are not listed. If the Amazon API gives this information, I am open to use it (please provide some examples of its usage). Thanks a lot in advance.
I have extracted product names from alt attribute. Is this as intended?
import requests
from bs4 import BeautifulSoup as bs
# Fetch the seller's listing page and parse it with lxml.
r = requests.get('https://www.amazon.com/s?me=A3WE363L17WQR&marketplaceID=ATVPDKIKX0DER')
soup = bs(r.content, 'lxml')
# Read the alt attribute of every element that has one and sits below an
# .a-link-normal element.
items = []
for item in soup.select('.a-link-normal [alt]'):
    items.append(item['alt'])
print(items)
Over two pages:
import requests
from bs4 import BeautifulSoup as bs
# URL template: both {} placeholders receive the page number.
url = 'https://www.amazon.com/s?i=merchant-items&me=A3WE363L17WQR&page={}&marketplaceID=ATVPDKIKX0DER&qid=1553116056&ref=sr_pg_{}'
# range(1, 3) covers pages 1 and 2.
for page in range(1,3):
    r = requests.get(url.format(page,page))
    soup = bs(r.content, 'lxml')
    # Read the alt attribute of every element that has one and sits below an
    # .a-link-normal element.
    items = [item['alt'] for item in soup.select('.a-link-normal [alt]')]
    # Print inside the loop so each page's names are shown, not only the last.
    print(items)

Categories