import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server Responded: ', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    # price
    # item
    h1 = soup.find('h1', id='itemTitle')
    print(h1)

def main():
    url = "https://www.ebay.com/itm/New-Longines-Master-Collection-Automatic-40mm-White-Mens-Watch-L2-909-4-78-3/383525040495?hash=item594bdfb16f:g:vdIAAOSwytheqbKu"
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
Hi, please help me with how I can select the item name on eBay. The item name is the title of the watch. I managed to get to the h1 and then to itemTitle, but I don't know how to extract just the title text.
Example
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url=url)
    if not response.ok:
        print('Server Responded: ', response.status_code)
    else:
        soup = BeautifulSoup(response.text, features='html.parser')
        return soup

def get_detail_data(soup):
    h1 = soup.select("span.g-hdn")[0]
    print(h1.next_sibling)
    return h1

if __name__ == "__main__":
    url = "https://www.ebay.com/itm/New-Longines-Master-Collection-Automatic-40mm-White-Mens-Watch-L2-909-4-78-3/383525040495?hash=item594bdfb16f:g:vdIAAOSwytheqbKu"
    get_detail_data(get_page(url))
Prints out
New Longines Master Collection Automatic 40mm White Men's Watch L2.909.4.78.3
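As an alternative, here is a minimal sketch that assumes eBay still nests the hidden "Details about" prefix in a span.g-hdn inside h1#itemTitle: take the h1 text and drop that prefix instead of walking to the sibling.

import requests
from bs4 import BeautifulSoup

def get_title(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    h1 = soup.find('h1', id='itemTitle')
    if h1 is None:
        return None
    hidden = h1.find('span', class_='g-hdn')  # the visually hidden "Details about" prefix
    if hidden is not None:
        hidden.extract()                      # remove it before reading the text
    return h1.get_text(strip=True)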
How can I get the page in a language other than the default?
My code:
import requests
from bs4 import BeautifulSoup

def scrape(page):
    url = page
    result = requests.get(url, stream=True)
    if result.status_code == 200:
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        return soup

product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232")
print(product)
Now when I open this page I get the German (DE) version. How can I get it in English (EN)? They don't have another URL or prefix for the language; it only changes via a button.
Edit:
import re
import requests
from bs4 import BeautifulSoup

def scrape(page):
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    url = page
    result = requests.get(url, stream=True)
    if result.status_code == 200:
        getPage = requests.get(url, headers=headers)
        soup = BeautifulSoup(getPage.content, 'html.parser')
        title = soup.select('.product-info-title-desktop')[0].text
        return title

product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232")
print(product)
Nothing changed :/
Try it:
import re
import requests
from bs4 import BeautifulSoup

def scrape(page):
    url = page
    result = requests.get(url, stream=True)
    if result.status_code == 200:
        getPage = requests.get(url)
        soup = BeautifulSoup(getPage.content, 'html.parser')
        title = soup.select('.product-info-title-desktop')[0].text
        return title

product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232&language=en")
print(product)
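The difference is the language=en query parameter: the shop switches languages through the URL, which is why the Accept-Language header alone changed nothing. A minimal sketch (assuming the parameter keeps working) to compare both versions:

import requests
from bs4 import BeautifulSoup

BASE = "https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232"

def get_title(url):
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    return soup.select('.product-info-title-desktop')[0].text.strip()

print("DE:", get_title(BASE))                   # default German title
print("EN:", get_title(BASE + "&language=en"))  # English title via the query parameter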
I wrote code to scrape title URLs, but I'm getting an error while extracting them, so could you please guide me? Here is my code:
import requests
from bs4 import BeautifulSoup
# import pandas as pd
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        # 1. html, 2. parser
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    # urls = [item.get('href') for item in titles_link]
    print(titles_link)

def main():
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/" \
              "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))

if __name__ == '__main__':
    main()
If you want to get all the links try this:
def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None)  # All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com", link.attrs.get('href', None)))
            except:
                continue
        print(titles_link_output)

def main():
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))

main()
Output:
['http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3309/rec/3', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2425/rec/4', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/150/rec/5', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3672/rec/8', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3407/rec/9', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4393/rec/10', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3445/rec/11', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3668/rec/12', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3703/rec/13', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2952/rec/14', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2898/rec/15', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3502/rec/16', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3553/rec/17', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4052/rec/18', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3440/rec/19', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3583/rec/20']
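Since the question already imports csv, here is a minimal sketch (the file name title_links.csv is just an example) of saving the collected URLs instead of only printing them:

import csv

def save_links(links, path='title_links.csv'):
    # One URL per row, with a header, so the file opens cleanly in a spreadsheet.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['url'])
        writer.writerows([link] for link in links)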
I want to scrape the person name, location, and phone number, but these all have the same class and no id. Here is the link to that web page: https://hipages.com.au/connect/emcoelectricalservices. Please guide me, thank you! Here is my code:
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    try:
        title = soup.find('h1', class_="sc-AykKI", id=False).text
    except:
        title = 'Empty Title'
    print(title)
    try:
        contact_person = soup.find('span', class_="kBpGee", id=False).text
    except:
        contact_person = 'Empty Person'
    print(contact_person)
    try:
        location = soup.find('span', class_="kBpGee", id=False).text
    except:
        location = 'Empty location'
    print(location)

def main():
    # get data of detail page
    url = "https://hipages.com.au/connect/emcoelectricalservices"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
Hi the below works:-
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 4 09:52:13 2020
@author: prakh
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    titles = []
    persons = []
    locations = []
    try:
        titles.append(soup.find('h1', class_="sc-AykKI", id=False).text)
    except:
        titles.append('Empty Title')
    try:
        persons.append(soup.findAll('span', class_="Contact__Item-sc-1giw2l4-2 kBpGee", id=False)[1].text)
    except:
        persons.append('Empty Person')
    try:
        locations.append(soup.findAll('span', class_="Contact__Item-sc-1giw2l4-2 kBpGee", id=False)[2].text)
    except:
        locations.append('Empty location')

    final_df = pd.DataFrame(
        {'Title': titles,
         'contact_person': persons,
         'location': locations
         })
    print(final_df)

def main():
    # get data of detail page
    url = "https://hipages.com.au/connect/emcoelectricalservices"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
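If the end goal is a file rather than console output, a minimal follow-up sketch (output.csv is just an example name) is to write the DataFrame out at the end of get_detail_data instead of printing it:

# inside get_detail_data, after building final_df:
final_df.to_csv('output.csv', index=False)  # persist the scraped row to disk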
I am trying to count the number of links on a Web page using the following code:
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
import pandas as pd

webpage = "https://www.isode.com/products/index.html"

try:
    response = requests.get(webpage)
    # response.raise_for_status()
except HTTPError:
    print("An HTTP Error has occurred")
except Exception as err:
    print(err)
else:
    print("The request of the webpage was a success!")

contents = response.content
contents

soup = BeautifulSoup(contents, features="html.parser")

a = 0
for link in soup.find_all("a"):
    if link.get("href"):
        a = a + 1
        print(link.get("href"))
My expected answer is 86, but this code is giving me 83, so I am lost as to where I am going wrong.
Also, in terms of having a count variable, surely there is a better way to do this?
import requests
from bs4 import BeautifulSoup

links = []
with requests.Session() as req:
    r = req.get('https://www.isode.com/products/index.html')
    soup = BeautifulSoup(r.text, 'html.parser')
    if r.status_code == 200:
        for item in soup.findAll('a'):
            item = item.get('href')
            if item is not None:
                links.append(item)

print(len(links))
Output:
83
But if you remove the if item is not None: condition, you will get 86, because three of the a tags have no href attribute.
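As for the count variable the question asks about, a minimal sketch: BeautifulSoup can filter on the href attribute directly, so the count is just the length of the result list.

import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.isode.com/products/index.html')
soup = BeautifulSoup(r.text, 'html.parser')

print(len(soup.find_all('a')))             # all <a> tags (86 on this page)
print(len(soup.find_all('a', href=True)))  # only <a> tags that have an href (83)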
Deep version:
import requests
from bs4 import BeautifulSoup

links = []
with requests.Session() as req:
    r = req.get('https://www.isode.com/products/index.html')
    soup = BeautifulSoup(r.text, 'html.parser')
    if r.status_code == 200:
        count = 0
        for item in soup.findAll('a'):
            item = item.get('href')
            if item is not None:
                if item.startswith('..'):
                    item = item.replace('..', 'https://www.isode.com')
                elif item.startswith('http'):
                    pass
                else:
                    item = f"https://www.isode.com/{item}"
                print(item)
                links.append(item)
            else:
                count += 1

print(f"Total Links: {len(links)}")
print(f"Total None: {count}")
I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the parse_links method so it only keeps the links (the 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the link text. Try the code below.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links whose text contains Brexit.
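If the match should also catch different capitalisations (an assumption about what you want), re.compile accepts the re.I flag:

import re

# matches "Brexit", "BREXIT", "brexit", ... in the link text
links = soup.find_all('a', text=re.compile("Brexit", re.I))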
You can get the text of the element using the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)