I want to scrape the person name, location and phone number, but all these have the same class and no id. Here is the link to that web page: https://hipages.com.au/connect/emcoelectricalservices Please guide me. Thank you!
Here is my code:
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    try:
        title = soup.find('h1', class_="sc-AykKI", id=False).text
    except:
        title = 'Empty Title'
    print(title)

    try:
        contact_person = soup.find('span', class_="kBpGee", id=False).text
    except:
        contact_person = 'Empty Person'
    print(contact_person)

    try:
        location = soup.find('span', class_="kBpGee", id=False).text
    except:
        location = 'Empty location'
    print(location)

def main():
    # get data of detail page
    url = "https://hipages.com.au/connect/emcoelectricalservices"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
Hi, the below works:
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 4 09:52:13 2020
#author: prakh
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    titles = []
    persons = []
    locations = []

    try:
        titles.append(soup.find('h1', class_="sc-AykKI", id=False).text)
    except:
        titles.append('Empty Title')

    try:
        persons.append(soup.findAll('span', class_="Contact__Item-sc-1giw2l4-2 kBpGee", id=False)[1].text)
    except:
        persons.append('Empty Person')

    try:
        locations.append(soup.findAll('span', class_="Contact__Item-sc-1giw2l4-2 kBpGee", id=False)[2].text)
    except:
        locations.append('Empty location')

    final_df = pd.DataFrame(
        {'Title': titles,
         'contact_person': persons,
         'location': locations
         })
    print(final_df)

def main():
    # get data of detail page
    url = "https://hipages.com.au/connect/emcoelectricalservices"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
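One caveat worth adding (my note, not part of the answer above): class names like sc-AykKI and Contact__Item-sc-1giw2l4-2 are generated by the site's styling framework and can change whenever the page is rebuilt, so matching only on the stable Contact__Item prefix may be more robust. A minimal sketch, assuming that prefix stays in the markup:

import re
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("https://hipages.com.au/connect/emcoelectricalservices").text, 'lxml')

# Match on the stable "Contact__Item" prefix instead of the full hashed class name.
contact_items = soup.find_all('span', class_=re.compile(r'^Contact__Item'))
for item in contact_items:
    print(item.get_text(strip=True))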
Related
I am trying to scrape Airbnb data using BeautifulSoup and Pandas. I checked a lot of tutorials and followed one of them. The step in which the soup should scrape the data from the next page is not working: out of 15 pages, it scrapes only the first 2 or 3, and sometimes none at all (even though the URLs of the pages are correct).
I cannot seem to understand why this happens or how to solve it. Can someone help out?
import requests
import bs4
import pandas as pd
import numpy as np
import csv
import time

url = 'https://www.airbnb.it/s/Italy/homes?checkin=2021-08-01&checkout=2021-08-02'

def get_page(url):
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    return soup

def get_listings(soup):
    result = []
    result.extend(soup.find_all("div", {"class": "_8ssblpx"}))
    return result

def get_listing_title(listing):
    for l in listing:
        try:
            return str(l.find('div', {'class': '_1tanv1h'}).text)
        except:
            return None

def get_listing_subtitle(listing):
    for l in listing:
        try:
            return str(l.find('span', {'class': '_1whrsux9'}).text)
        except:
            return None

def get_listing_info(listing):
    for l in listing:
        try:
            return str(l.find_all('div', {'class': '_3c0zz1'})[0].text.lower())
        except:
            return None

def find_next_page(page):
    base_url = "https://www.airbnb.it"
    try:
        nextpage = base_url + get_page(url).find_all("div", attrs={"class": "_jro6t0"})[0].find("a", attrs={'class': '_za9j7e'})['href']
    except:
        nextpage = None
    return nextpage

title = []
subtitle = []
info = []

while url is not None:
    soup = get_page(url)
    listings = get_listings(soup)
    for l in listings:
        title.append(get_listing_title(l))
        subtitle.append(get_listing_subtitle(l))
        info.append(get_listing_info(l))
    time.sleep(5)
    url = find_next_page(soup)
    print(url)

airbnb_data = pd.DataFrame(data={'title': title,
                                 'subtitle': subtitle,
                                 'info': info})
airbnb_data
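One detail that stands out (my observation, not an answer from the thread): find_next_page ignores its page argument and re-fetches the current URL with get_page(url), so every page is requested twice. A minimal sketch that reuses the soup already in hand (the _jro6t0 / _za9j7e class names are taken from the question and may have changed on Airbnb's side); note that Airbnb renders much of its listing markup with JavaScript, which can also explain why requests sees no next-page link on some pages:

def find_next_page(page):
    # Reuse the soup that was already fetched instead of requesting the page again.
    base_url = "https://www.airbnb.it"
    try:
        next_link = page.find_all("div", attrs={"class": "_jro6t0"})[0].find("a", attrs={"class": "_za9j7e"})
        return base_url + next_link["href"]
    except (IndexError, TypeError, KeyError):
        return None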
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server Responded: ', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    # price
    # item
    h1 = soup.find('h1', id='itemTitle')
    print(h1)

def main():
    url = "https://www.ebay.com/itm/New-Longines-Master-Collection-Automatic-40mm-White-Mens-Watch-L2-909-4-78-3/383525040495?hash=item594bdfb16f:g:vdIAAOSwytheqbKu"
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
Hi, please help me with how I can select the item name on eBay. The item name is the title of the watch. I managed to get to the h1 and then to itemTitle.
Example
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url=url)
    if not response.ok:
        print('Server Responded: ', response.status_code)
    else:
        soup = BeautifulSoup(response.text, features='html.parser')
        return soup

def get_detail_data(soup):
    h1 = soup.select("span.g-hdn")[0]
    print(h1.next_sibling)
    return h1

if __name__ == "__main__":
    url = "https://www.ebay.com/itm/New-Longines-Master-Collection-Automatic-40mm-White-Mens-Watch-L2-909-4-78-3/383525040495?hash=item594bdfb16f:g:vdIAAOSwytheqbKu"
    get_detail_data(get_page(url))
Prints out
New Longines Master Collection Automatic 40mm White Men's Watch L2.909.4.78.3
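For context, span.g-hdn appears to be the visually hidden "Details about" label that eBay places inside the h1#itemTitle heading, so the text node right after it (next_sibling) is the bare item name. A sketch of the same idea starting from the h1 the question already finds (the markup details are an assumption about eBay's page at the time):

def get_item_title(soup):
    # h1#itemTitle holds a hidden "Details about" span followed by the plain title text.
    h1 = soup.find('h1', id='itemTitle')
    hidden_label = h1.find('span', class_='g-hdn')
    if hidden_label is not None and hidden_label.next_sibling is not None:
        return str(hidden_label.next_sibling).strip()
    return h1.get_text(strip=True)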
I wrote code to scrape title URLs, but I'm getting an error while extracting them, so could you please guide me?
Here is my code:
import requests
from bs4 import BeautifulSoup
# import pandas as pd
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        # 1. html , 2. parser
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    # urls = [item.get('href') for item in titles_link]
    print(titles_link)

def main():
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/" \
              "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))

if __name__ == '__main__':
    main()
If you want to get all the links, try this:
def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html , 2. parser
        return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None)  # All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com", link.attrs.get('href', None)))
            except:
                continue
        print(titles_link_output)

def main():
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))

main()
Output:
['http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3309/rec/3', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2425/rec/4', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/150/rec/5', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3672/rec/8', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3407/rec/9', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4393/rec/10', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3445/rec/11', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3668/rec/12', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3703/rec/13', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2952/rec/14', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2898/rec/15', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3502/rec/16', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3553/rec/17', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4052/rec/18', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3440/rec/19', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3583/rec/20']
I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case, I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to get the specific text value. Try the code below.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links that contain 'Brexit'.
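As a rough sketch of where that filter could go, the one-liner can replace the find_all call inside the question's parse_links (same attributes as the original class; re.IGNORECASE is my addition). One caveat: text= only matches anchors whose direct string matches, so links whose text sits in nested tags would be missed, which the getText()-based answers below avoid:

import re

def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    # Keep only anchors whose text mentions Brexit.
    links = soup.find_all('a', href=True, text=re.compile("Brexit", re.IGNORECASE))
    for link in links:
        url = link['href']
        if url.startswith('/') or url.startswith(self.root_url):
            url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.to_crawl.put(url)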
You can get the text of the element with the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
I want to get incubator information by web scraping, and I use Python, but I get nothing after running my code. Here is my code. I need your help!
import requests
from requests.exceptions import RequestException
import re

def get_one_page(url):
    try:
        r = requests.get(url)
        if r.status_code == 200:
            return r.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile('f14px c-blue.*?><a.*?>(.*?)</a>.*?fn14px c-666>(.*?)</td>')
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'name': item[0],
            'address': item[1]
        }

def main(offset):
    url = 'http://www.cnfuhuaqi.com/couveuse/0-0-0-0-0-%d.aspx' % offset
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)

if __name__ == '__main__':
    for i in range(2, 72):
        main(i)
Never parse HTML with regex; use an HTML parser such as BeautifulSoup. In your case, you only need to select the element with the zjfw-list-con class and extract the tables inside it. The following will extract the image src URL, the link and the description for 2 iterations (pages 2 and 3):
from bs4 import BeautifulSoup
import requests

incubators = []

def extract_data(url):
    print("get data from {}".format(url))
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("div", {"class": "zjfw-list-con"})[0].find_all("table")
    for table in tables:
        for subtable in table.find_all('table'):
            items = subtable.find('tr').find_all('td')
            item_tuple = (
                items[0].find('img')['src'],
                items[1].find('a')['href'],
                items[2].text.strip()
            )
            print(item_tuple)
            incubators.append(item_tuple)

url = 'http://www.cnfuhuaqi.com/couveuse/0-0-0-0-0-%d.aspx'

for i in range(2, 4):
    extract_data(url % i)

print("the full list : ")
for i in incubators:
    print(' '.join(i))
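A small usage note (my suggestion, not part of the answer): to cover the full range the question loops over, the driver loop can simply be widened, ideally with a pause between requests:

import time

for i in range(2, 72):
    extract_data(url % i)
    time.sleep(1)  # pause between requests to avoid hammering the server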