Scraping multiple pages on this site (help needed) - python

Hi, I would love to be able to scrape multiple pages of this website.
Can someone help me with scraping through all the pages? No matter what I try, I only get information from one page.
from requests import get
from bs4 import BeautifulSoup as bs
import re

headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

for i in range(2000):
    Centris = 'https://www.centris.ca/en/commercial-units~for-rent~montreal-ville-marie/26349148?view=Summary'.format(i)
    r = get(Centris, headers=headers)
    soup = bs(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'id': 'divMainResult'})

    data = []
    for result in results:
        titre = result.find('span', attrs={'data-id': 'PageTitle'})
        titre = [str(titre.string).strip() for titre in titre]
        superficie = result.find('div', attrs={'class': 'carac-value'}, string=re.compile('sqft'))
        superficie = [str(superficie.string).strip() for superficie in superficie]
        emplacement = result.find_all('h2', attrs={'class': 'pt-1'})
        emplacement = [str(emplacement.string).strip() for emplacement in emplacement]
        prix = result.find_all('span', attrs={'class': 'text-nowrap'})
        prix = [(prix.text).strip('\w.') for prix in prix]
        description = result.find_all('div', attrs={'itemprop': 'description'})
        description = [str(description.string).strip() for description in description]
        lien = result.find_all('a', attrs={'class': 'dropdown-item js-copy-clipboard'})

To get pagination working, you can simulate the site's Ajax requests with the requests module:
import json
import requests
from bs4 import BeautifulSoup

url = "https://www.centris.ca/Property/GetInscriptions"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
}
json_data = {"startPosition": 0}

with requests.session() as s:
    # load cookies:
    s.get(
        "https://www.centris.ca/en/commercial-units~for-rent?uc=0",
        headers=headers,
    )
    for page in range(0, 100, 20):  # <-- increase number of pages here
        json_data["startPosition"] = page
        data = s.post(url, headers=headers, json=json_data).json()
        soup = BeautifulSoup(data["d"]["Result"]["html"], "html.parser")
        for a in soup.select(".a-more-detail"):
            print(a.select_one(".category").get_text(strip=True))
            print(a.select_one(".address").get_text(strip=True, separator="\n"))
            print("https://www.centris.ca" + a["href"])
            print("-" * 80)
Prints:
Commercial unit for rent
6560, Avenue de l'Esplanade, suite 105
Montréal (Rosemont/La Petite-Patrie)
Neighbourhood La Petite-Patrie
https://www.centris.ca/en/commercial-units~for-rent~montreal-rosemont-la-petite-patrie/16168393?view=Summary
--------------------------------------------------------------------------------
Commercial unit for rent
75, Rue Principale
Gatineau (Aylmer)
Neighbourhood Vieux Aylmer, Des Cèdres, Marina
https://www.centris.ca/en/commercial-units~for-rent~gatineau-aylmer/22414903?view=Summary
--------------------------------------------------------------------------------
Commercial building for rent
53, Rue Saint-Pierre, suite D
Saint-Pie
https://www.centris.ca/en/commercial-buildings~for-rent~saint-pie/15771470?view=Summary
--------------------------------------------------------------------------------
...and so on.
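As an aside, the loop in the question never moves to a new page: the URL string contains no {} placeholder, so .format(i) returns it unchanged, and the link itself points at a single listing's detail page rather than at the results list. A minimal illustration of the difference (the paginated URL below is only a hypothetical example, not a real Centris endpoint):

base = 'https://example.com/results?page={}'   # hypothetical paginated URL with a placeholder
print(base.format(3))      # -> https://example.com/results?page=3

detail = 'https://www.centris.ca/en/commercial-units~for-rent~montreal-ville-marie/26349148?view=Summary'
print(detail.format(3))    # no {} placeholder, so the same URL is returned on every iteration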

Thank you so much! I came up with this and it worked perfectly:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.centris.ca/Property/GetInscriptions"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
}
json_data = {"startPosition": 0}

with requests.session() as s:
    Centris = []
    # load cookies:
    s.get(
        "https://www.centris.ca/en/commercial-units~for-rent?uc=0",
        headers=headers,
    )
    for page in range(0, 100, 20):  # <-- increase number of pages here
        json_data["startPosition"] = page
        data = s.post(url, headers=headers, json=json_data).json()
        soup = BeautifulSoup(data["d"]["Result"]["html"], "html.parser")
        for a in soup.select(".a-more-detail"):
            titre = a.select_one(".category").get_text(strip=True)
            emplacement = a.select_one(".address").get_text(strip=True, separator="\n")
            lien = "https://www.centris.ca" + a["href"]
            prix = a.select_one(".price").get_text(strip=True)
            Centris.append((titre, emplacement, lien, prix))

df = pd.DataFrame(Centris, columns=['Titre', 'Emplacement', 'Lien', 'Prix'])
writer = pd.ExcelWriter('Centris.xlsx')
df.to_excel(writer)
writer.save()
print('Data Saved To excel')
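One caveat: on newer pandas versions ExcelWriter.save() has been removed, so the last three lines may raise an error. If that happens, a minimal alternative is to let pandas manage the writer itself (this assumes openpyxl or another Excel engine is installed):

# Equivalent export without calling writer.save() explicitly
df.to_excel('Centris.xlsx', index=False)

# or, if you need the writer object, use it as a context manager
with pd.ExcelWriter('Centris.xlsx') as writer:
    df.to_excel(writer, index=False)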

Related

Grabbing all data fields from a div in python beautifulsoup

The snippet below worked fine until the other day. Is there any way to extract all the data inside this div class="row mb-4" easily? My thinking is that if additional changes are made to the page, the script should still not be affected.
import re
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

url = "https://bscscan.com/token/"
token = "0x4ce1a5cb12151423ea479cfd0c52ec5021d108d8"
tokenurl = str(url) + str(token)

contractpage = requests.get(tokenurl, headers=header)
ca = BeautifulSoup(contractpage.content, 'html.parser')

tokenholders = ca.find(id='ContentPlaceHolder1_tr_tokenHolders').get_text()
tokenholdersa = (((tokenholders.strip().strip("Holders:")).strip()).strip(" a ")).strip()
tholders = ((((tokenholders.strip()).strip("Holders:")).strip()).strip(" a ")).strip()
tokenaname = ca.find('span', class_='text-secondary small').get_text().strip()

def get_transfer_count(token: str) -> str:
    with requests.Session() as s:
        s.headers = {'User-Agent': 'Mozilla/5.0'}
        r = s.get(f'https://bscscan.com/token/{token}')
        try:
            sid = re.search(r"var sid = '(.*?)'", r.text).group(1)
            r = s.get(f'https://bscscan.com/token/generic-tokentxns2?m=normal&contractAddress={token}&a=&sid={sid}&p=1')
            return re.search(r"var totaltxns = '(.*?)'", r.text).group(1)
        except:
            pass

transcount = get_transfer_count(token)

print("Token: ", tokenaname)
print("Holders: ", tholders)
print("Transfers: ", transcount)
Previous Output:
Token: Binemon
Holders: 27,099
Transfers: 439,636
Wanted Improved Output:
Token: Binemon
PRICE: $0.01 # 0.000037 BNB (-22.41%)
Fully Diluted Market Cap: $14,011,783.50
Total Supply: 975,000,000 BIN
Holders: 27,099 addresses
Transfers: 439,636
Contract: 0xe56842ed550ff2794f010738554db45e60730371
Decimals: 18
Official Site: https://binemon.io/
Social Profiles:
https://twitter.com/binemonnft
https://t.me/binemonchat
https://docs.binemon.io/
https://coinmarketcap.com/currencies/binemon/
https://www.coingecko.com/en/coins/binemon/
Try:
import requests
from bs4 import BeautifulSoup

header = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}

tokenurl = (
    "https://bscscan.com/token/0x7083609fce4d1d8dc0c979aab8c869ea2c873402"
)

contractpage = requests.get(tokenurl, headers=header)
ca = BeautifulSoup(contractpage.content, "html.parser")

name = ca.h1.span.get_text(strip=True)
price = ca.select_one(".card-body .d-block").get_text(strip=True)
cap = ca.select_one("#pricebutton").get_text(strip=True)

print("Token:", name)
print("PRICE:", price)
print("Fully Diluted Market Cap:", cap)
print()

for c in ca.select(".row .col-md-8"):
    pt = c.find_previous(class_="col-md-4").get_text(strip=True)
    t = c.get_text(strip=True, separator=" ").split("(")[0]
    if pt == "Social Profiles:":
        links = [a["href"].strip() for a in c.select("a")]
        print(pt, *links, sep="\n\t")
    else:
        print(pt, t)
Prints:
Token: Binance-Peg Polkadot Token
PRICE: $30.35# 0.079643 BNB(-10.39%)
Fully Diluted Market Cap: $485,657,455.49
Total Supply: 15,999,999.991309 DOT
Holders: 80,065 addresses
Transfers: -
Contract: 0x7083609fce4d1d8dc0c979aab8c869ea2c873402
Decimals: 18
Official Site: https://polkadot.network/
Social Profiles:
https://polkadot.network/blog
https://reddit.com/r/dot
https://twitter.com/polkadotnetwork
https://github.com/w3f
https://polkadot.network/PolkaDotPaper.pdf
https://coinmarketcap.com/currencies/polkadot-new/
https://www.coingecko.com/en/coins/polkadot/
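Note that the Transfers row prints as "-" because that figure is filled in by JavaScript after the page loads. If you still need it, the sid-based endpoint from your original get_transfer_count can be reused alongside the code above; a sketch reusing the same regexes (this will break if bscscan changes its markup):

import re
import requests

token = "0x7083609fce4d1d8dc0c979aab8c869ea2c873402"

with requests.Session() as s:
    s.headers = {"User-Agent": "Mozilla/5.0"}
    # the token page embeds a session id (sid) that the transfers endpoint expects
    r = s.get(f"https://bscscan.com/token/{token}")
    sid = re.search(r"var sid = '(.*?)'", r.text)
    if sid:
        r = s.get(
            "https://bscscan.com/token/generic-tokentxns2"
            f"?m=normal&contractAddress={token}&a=&sid={sid.group(1)}&p=1"
        )
        total = re.search(r"var totaltxns = '(.*?)'", r.text)
        print("Transfers:", total.group(1) if total else "-")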

how to get the content of a title using BeautifulSoup4 and requests

So I have taken the titles of the medicines from this link: Medicines List
Now I want to get the content for every medicine; each medicine has its own link.
Example:
Medicines Example
How can I get the content of those medicines using the BeautifulSoup4 and requests libraries?
import requests
from bs4 import BeautifulSoup
from pprint import pp

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

def main(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    title = [x.text for x in soup.select(
        'a[class$=section__item-link]')]
    count = 0
    for x in range(0, len(title)):
        count += 1
        print("{0}. {1}\n".format(count, title[x]))

main('https://www.klikdokter.com/obat')
Based on what I can see in the response from https://www.klikdokter.com/obat, you should be able to do something like this:
import requests
from bs4 import BeautifulSoup

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://www.klikdokter.com/obat'

headers = {'User-Agent': AGENT}

response = requests.get(BASEURL, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

for tag in soup.find_all('a', class_='topics-index--section__item-link'):
    href = tag.get('href')
    if href is not None:
        print(href)
        response = requests.get(href, headers=headers)
        response.raise_for_status()
        """ Do your processing here """

How do I capture the URL of each job so I can open full job description when looking at csv file

Can someone help me modify this script so that it also scrapes the URL associated with each job? The purpose is that, when browsing the .csv file in a spreadsheet, I can click on the link if I would like to know more about the job. Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'summary': summary
        }
        joblist.append(job)
    return

joblist = []
for i in range(0, 90, 10):
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
You can use one of these:
url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
BTW:
You always load the same page. To get the next page you have to use start=... in the URL.
You can also make this more readable by using a dictionary and params= in requests:
payload = {
    'q': 'Dispensary',
    'l': 'Denver,+CO',
    'radius': 0,
    'start': page,
}

url = 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
Working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(start):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
    payload = {
        'q': 'Dispensary',
        'l': 'Denver,+CO',
        'radius': 0,
        'start': start,
    }
    url = 'https://www.indeed.com/jobs'
    r = requests.get(url, params=payload, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup, joblist):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        url = 'https://www.indeed.com' + item.find('a')['href']
        #url = 'https://www.indeed.com' + item.find('a').get('href')
        #url = 'https://www.indeed.com' + item.find('a').attrs['href']
        #url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        joblist.append({
            'title': title,
            'url': url,
            'company': company,
            'salary': salary,
            'summary': summary
        })

# --- main ---

joblist = []

for start in range(0, 90, 10):
    print('Getting page', start)
    c = extract(start)
    transform(c, joblist)

df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())
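Whether the links end up clickable depends on your spreadsheet application; many recognise plain URLs in a CSV automatically. If yours does not, one option (assuming the application evaluates formulas on import, as Excel does for .csv files) is to wrap each URL in a HYPERLINK formula before saving:

# Optional: turn the plain URLs into Excel HYPERLINK formulas so they are clickable
df['url'] = df['url'].map(lambda u: f'=HYPERLINK("{u}")')
df.to_csv('jobs.csv', index=False)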

Extracting company name and other information inside all urls present in a webpage using beautifulsoup

<li>
    <strong>Company Name</strong>
    ":"
    <span itemprop="name">PT ERA MURNI BUSANA</span>
</li>
In the above HTML code, I am trying to extract the company name which is PT ERA MURNI BUSANA.
If I use a single test link, I can get the name using the one-line code I wrote:
soup.find_all("span",attrs={"itemprop":"name"})[3].get_text()
But I want to extract the information from all such pages linked from a single web page.
So I wrote a for loop, but it does not fetch the right details. I am pasting the part of the code that I have been trying, which needs some modification.
Code:
for link in supplierlinks:  # links have been extracted and merged with the base url
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    companyname = soup.find_all("span", attrs={"itemprop": "name"})[2].get_text()
Output looks like:
{'Company Name': 'AIRINDO SAKTI GARMENT PT'}
{'Company Name': 'Garments'}
{'Company Name': 'Garments'}
Instead of "Garments" popping up in the output, I need the company name. How do I modify the code within the for loop?
Link: https://idn.bizdirlib.com/node/5290
Try this code:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'}

r = requests.get('https://idn.bizdirlib.com/node/5290', headers=headers).text
soup = BeautifulSoup(r, 'html5lib')
print(soup.find_all("span", attrs={"itemprop": "name"})[-1].get_text())

div = soup.find('div', class_="content clearfix")
li_tags = div.div.find_all('fieldset')[1].find_all('div')[-1].ul.find_all('li')

supplierlinks = []
for li in li_tags:
    try:
        supplierlinks.append("https://idn.bizdirlib.com/" + li.a['href'])
    except:
        pass

for link in supplierlinks:
    r = requests.get(link, headers=headers).text
    soup = BeautifulSoup(r, 'html5lib')
    print(soup.find_all("span", attrs={"itemprop": "name"})[-1].get_text())
Output:
PT ERA MURNI BUSANA
PT ELKA SURYA ABADI
PT EMPANG BESAR MAKMUR
PT EMS
PT ENERON
PT ENPE JAYA
PT ERIDANI TOUR AND TRAVEL
PT EURO ASIA TRADE & INDUSTRY
PT EUROKARS CHRISDECO UTAMA
PT EVERAGE VALVES METAL
PT EVICO
This code prints the company names for all the links on the page.
You can select the sibling element of the <strong> element that contains the text "Company Name" (also, don't forget to set the User-Agent HTTP header):
import requests
from bs4 import BeautifulSoup
url = 'https://idn.bizdirlib.com/node/5290'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
print( soup.select_one('strong:contains("Company Name") + *').text )
Prints:
PT ERA MURNI BUSANA
EDIT: To get the contact person:
import requests
from bs4 import BeautifulSoup
url = 'https://idn.bizdirlib.com/node/5290'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
print( soup.select_one('strong:contains("Company Name") + *').text )
print( soup.select_one('strong:contains("Contact") + *').text )
Prints:
PT ERA MURNI BUSANA
Mr. Yohan Kustanto
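The same selector can also be dropped into the first answer's loop over supplierlinks if you prefer it to indexing with [-1]; a rough combination of the two answers (assuming supplierlinks and headers have been built as shown above):

for link in supplierlinks:
    soup = BeautifulSoup(requests.get(link, headers=headers).content, 'html.parser')
    name = soup.select_one('strong:contains("Company Name") + *')
    print(name.text if name else '(company name not found)')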

Why can't I scrape Amazon products by BeautifulSoup?

I am trying to scrape the heading of this Amazon listing. The code I wrote works for some other Amazon listings, but not for the URL mentioned in the code below.
Here is the python code I've tried:
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.in/BULLMER-Cotton-Printed-T-shirt-Multicolour/dp/B0892SZX7F/ref=sr_1_4?c=ts&dchild=1&keywords=Men%27s+T-Shirts&pf_rd_i=1968024031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_p=8b97601b-3643-402d-866f-95cc6c9f08d4&pf_rd_r=EPY70Y57HP1220DK033Y&pf_rd_s=merchandised-search-6&qid=1596817115&refinements=p_72%3A1318477031&s=apparel&sr=1-4&ts_id=1968123031"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}

page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, "html.parser")
#print(soup.prettify())

title = soup.find(id="productTitle")
if title:
    title = title.get_text()
else:
    title = "default_title"
print(title)
Output:
200
default_title
html code from inspector tools:
<span id="productTitle" class="a-size-large product-title-word-break">
BULLMER Mens Halfsleeve Round Neck Printed Cotton Tshirt - Combo Tshirt - Pack of 3
</span>
First, as others have commented, use a proxy service. Second, to reach an Amazon product page, having the ASIN is enough.
Amazon follows this URL pattern for all product pages:
https://www.amazon.(com/in/fr)/dp/<asin>
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.in/dp/B0892SZX7F"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, "html.parser")

title = soup.find("span", {"id": "productTitle"})
if title:
    title = title.get_text(strip=True)
else:
    title = "default_title"
print(title)
Output:
200
BULLMER Mens Halfsleeve Round Neck Printed Cotton Tshirt - Combo Tshirt - Pack of 3
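Even with the shorter /dp/ URL, Amazon sometimes returns a bot-check page with a 200 status, in which case productTitle is missing. A simple retry loop is one way to work around that; this is only a suggestion layered on top of the snippet above (it reuses its url, headers, and imports):

import time

title = None
for attempt in range(3):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("span", {"id": "productTitle"})
    if title:
        break
    time.sleep(2)   # brief pause before retrying

print(title.get_text(strip=True) if title else "default_title")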
This worked fine for me:
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.in/BULLMER-Cotton-Printed-T-shirt-Multicolour/dp/B0892SZX7F/ref=sr_1_4?c=ts&dchild=1&keywords=Men%27s+T-Shirts&pf_rd_i=1968024031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_p=8b97601b-3643-402d-866f-95cc6c9f08d4&pf_rd_r=EPY70Y57HP1220DK033Y&pf_rd_s=merchandised-search-6&qid=1596817115&refinements=p_72%3A1318477031&s=apparel&sr=1-4&ts_id=1968123031"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}

http_proxy = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy = "ftp://10.10.1.10:3128"

proxyDict = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}

page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, "lxml")
#print(soup.prettify())

title = soup.find(id="productTitle")
if title:
    title = title.get_text()
else:
    title = "default_title"
print(title)
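Note that proxyDict in this answer is built but never passed to requests, so the request still goes out directly; the addresses are placeholders anyway. To actually route the request through a proxy you would pass it explicitly, for example:

# Only takes effect if proxies= is supplied (replace the placeholder addresses first)
page = requests.get(url, headers=headers, proxies=proxyDict)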
