Python: scraping email protection address from href link

I want to get email addresses from:
https://thenationalweddingdirectory.com.au/suppliers/wedding-venues/queensland/the-dock-mooloolaba-events/
Right now I have the code below, but how can I scrape the email address from each clicked link?
from requests_html import HTMLSession

url = 'https://thenationalweddingdirectory.com.au/explore/?category=wedding-venues&region=melbourne&sort=top-rated'

s = HTMLSession()
r = s.get(url)
r.html.render(sleep=1)

# Grab the listings container and follow every absolute link inside it
products = r.html.xpath('//*[@id="finderListings"]/div[2]', first=True)
for item in products.absolute_links:
    r = s.get(item)
    print(r.html.find('li.lmb-calltoaction a', first=True))

The email and telephone are on the page; there is a JSON (JSON-LD) script with all the info you need. There is also an "ajax" request you can use to get all the URLs to visit.
import json
import re

import requests
from bs4 import BeautifulSoup

params = {
    'mylisting-ajax': '1',
    'action': 'get_listings',
    'form_data[page]': '0',
    'form_data[preserve_page]': 'false',
    'form_data[category]': 'wedding-venues',
    'form_data[region]': 'melbourne',
    'form_data[sort]': 'top-rated',
    'listing_type': 'place',
}

response = requests.get('https://thenationalweddingdirectory.com.au/', params=params)

# Get all listing URLs from the AJAX response
results = re.findall(
    "https://thenationalweddingdirectory.com.au/suppliers/wedding-venues/melbourne/[a-zA-Z-]*/",
    response.text.replace("\\", ""),
)

headers = {
    'accept': '*/*',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7,ru;q=0.6',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}

for result in results:
    print("Navigate: " + result)
    response = requests.get(result, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The "LocalBusiness" JSON-LD script holds name, telephone and email
    for script in soup.find_all("script"):
        if "LocalBusiness" in script.text:
            data = json.loads(script.text)
            print("Name: " + data["name"])
            print("Telephone: " + data["telephone"])
            print("Email: " + data["email"])
            break
OUTPUT:
Navigate: https://thenationalweddingdirectory.com.au/suppliers/wedding-venues/melbourne/metropolis-events/
Name: Metropolis Events
Telephone: 03 8537 7300
Email: info@metropolisevents.com.au
Navigate: https://thenationalweddingdirectory.com.au/suppliers/wedding-venues/melbourne/cotham-dining/
Name: Cotham Dining
Telephone: 0411 931 818
Email: hello@cothamdining.com.au
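As an aside, the email links on pages like this are often wrapped in Cloudflare's email protection (an <a class="__cf_email__" data-cfemail="..."> tag); reading the JSON-LD block sidesteps that. If you ever need to decode a data-cfemail value yourself, this is a minimal sketch of the standard XOR decoding:

def decode_cfemail(cfemail: str) -> str:
    """Decode a Cloudflare data-cfemail hex string; the first byte is the XOR key."""
    key = int(cfemail[:2], 16)
    return "".join(
        chr(int(cfemail[i:i + 2], 16) ^ key) for i in range(2, len(cfemail), 2)
    )

Calling decode_cfemail on the data-cfemail attribute value returns the plain address.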

Related

Log in to a website with csrfmiddlewaretoken verification using python

I am using the following code to log in to a website with csrfmiddlewaretoken verification, but it throws the following error:
csrfmiddlewaretoken = HTML.find_all('input')[1]['value']
IndexError: list index out of range
What do you think the problem is? I'm new to Python :)
import requests
from bs4 import BeautifulSoup

request_url = 'https://text.gob.pe/accounts/login/'

with requests.Session() as session:
    get_url = session.get('https://text.gob.pe/accounts/login/')
    HTML = BeautifulSoup(get_url.text, 'html.parser')
    csrfmiddlewaretoken = HTML.find_all('input')[1]['value']
    # Logging in
    payload = {
        'username': 'secret',
        'password': 'secret',
        'next': '/profile/',
        'csrfmiddlewaretoken': csrfmiddlewaretoken,
    }
    headers = {
        'Referer': 'https://text.gob.pe/accounts/login/'
    }
    login_request = session.post(request_url, data=payload, headers=headers)
    home_page = session.get("https://text.gob.pe/ficha/buscar/")
    print(home_page.content)
Without a username and password it is difficult to show how to proceed, but one immediate mistake is that you didn't specify headers:
import requests
from bs4 import BeautifulSoup

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'content-type': 'application/x-www-form-urlencoded',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

url = "https://logincovid19.minsa.gob.pe/accounts/login/"

session = requests.Session()
response = session.get(url, headers=headers)
csrfmiddlewaretoken = BeautifulSoup(response.text, 'lxml').find('input', {'name': 'csrfmiddlewaretoken'}).get('value')
Now csrfmiddlewaretoken is something like this: eyHLYFv7HOYxglzFS9a3JDxOT38u8mrakdwhatOnkvcJJzwN9dNi6olBxJxD1HZi
I think the rest of the code will look like this, but you need to check it:
user = 'YourUserNameHere'
password = 'UserPasswordHere'
payload = f'csrfmiddlewaretoken={csrfmiddlewaretoken}&username={user}&password={password}'
response = session.post(url, data=payload, headers=headers)
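A quick sanity check afterwards (a sketch, not part of the original answer; the heuristic is an assumption): Django typically re-renders the login form on a failed login, so you can look for the form in the response.

# Rough heuristic (assumption): a failed Django login re-renders the login form
if 'name="csrfmiddlewaretoken"' in response.text and 'password' in response.text:
    print("Login probably failed; the login form is still present")
else:
    print("Login looks successful")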

Scrape Verify link Href From Sites

I want to get the Verify href from a Gmailnator inbox; the page contains the Discord verify href.
I want to get this href using bs4 and pass it into a Selenium driver call like driver.get(url), the url being the href of course.
Can someone show me how to scrape the href from the Gmailnator inbox? I did try the page source, however the page source does not contain the href.
Below is the code I had written to get the href; the href I require (the Discord one) is in a frame source, so I think that's why it didn't come up.
UPDATE: everything is done and fixed:
driver.get('https://www.gmailnator.com/inbox/#for.ev.e.r.my.girlt.m.p@gmail.com')
time.sleep(6)
driver.find_element_by_xpath('//*[@id="mailList"]/tbody/tr[2]/td/a/table/tbody/tr/td[1]').click()
time.sleep(4)

url = driver.current_url
email_for_data = driver.current_url.split('/')[-3]
print(url)
time.sleep(2)

print('Getting Your Discord Verify link')
print('Time To Get Your Discord Link')

soup = BeautifulSoup(requests.get(url).text, "lxml")
token = soup.find("meta", {"name": "csrf-token"})["content"]
cf_email = soup.find("a", class_="__cf_email__")["data-cfemail"]

endpoint = "https://www.gmailnator.com/mailbox/get_single_message/"
data = {
    "csrf_gmailnator_token": token,
    "action": "get_message",
    "message_id": url.split("#")[-1],
    "email": f"{email_for_data}",
}
headers = {
    "referer": f"https://www.gmailnator.com/{email_for_data}/messageid/",
    "cookie": f"csrf_gmailnator_cookie={token}; ci_session={cf_email}",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.86 "
                  "YaBrowser/21.3.0.740 Yowser/2.5 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

r = requests.post(endpoint, data=data, headers=headers)
the_real_slim_shady = (
    BeautifulSoup(r.json()["content"], "lxml")
    .find_all("a", {"target": "_blank"})[1]["href"]
)
print(the_real_slim_shady)
You can fake it all with pure requests to get the Verify link. First, you need to get the token and the cf_email values. Then, things are pretty straightforward.
Here's how to get the link:
import requests
from bs4 import BeautifulSoup

url = "https://www.gmailnator.com/geralddoreyestmp/messageid/#179b454b4c482c4d"
soup = BeautifulSoup(requests.get(url).text, "lxml")

token = soup.find("meta", {"name": "csrf-token"})["content"]
cf_email = soup.find("a", class_="__cf_email__")["data-cfemail"]

endpoint = "https://www.gmailnator.com/mailbox/get_single_message/"
data = {
    "csrf_gmailnator_token": token,
    "action": "get_message",
    "message_id": url.split("#")[-1],
    "email": "geralddoreyestmp",
}
headers = {
    "referer": "https://www.gmailnator.com/geralddoreyestmp/messageid/",
    "cookie": f"csrf_gmailnator_cookie={token}; ci_session={cf_email}",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.86 "
                  "YaBrowser/21.3.0.740 Yowser/2.5 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

r = requests.post(endpoint, data=data, headers=headers)
the_real_slim_shady = (
    BeautifulSoup(r.json()["content"], "lxml")
    .find_all("a", {"target": "_blank"})[1]["href"]
)
print(the_real_slim_shady)
Output (your link will be different!):
https://click.discord.com/ls/click?upn=qDOo8cnwIoKzt0aLL1cBeARJoBrGSa2vu41A5vK-2B4us-3D77CR_3Tswyie9C2vHlXKXm6tJrQwhGg-2FvQ76GD2o0Zl2plCYHULNsKdCuB6s-2BHk1oNirSuR8goxCccVgwsQHdq1YYeGQki4wtPdDA3zi661IJL7H0cOYMH0IJ0t3sgrvr2oMX-2BJBA-2BWZzY42AwgjdQ-2BMAN9Y5ctocPNK-2FUQLxf6HQusMayIeATMiTO-2BlpDytu-2FnIW4axB32RYQpxPGO-2BeHtcSj7a7QeZmqK-2B-2FYkKA4dl5q8I-3D
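Since the original goal was to open the scraped link with Selenium, the handoff is a one-liner (a sketch; it assumes chromedriver is available on your PATH):

from selenium import webdriver

driver = webdriver.Chrome()      # assumes a local chromedriver on PATH
driver.get(the_real_slim_shady)  # open the scraped Verify link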

How to scrape data from Amazon Canada?

I am trying to scrape data from Amazon Canada (amazon.ca). I am using the requests and bs4 packages to fetch and parse the HTML, but I am not able to extract the data from the response. Can someone please help me extract information from the response?
import requests
from bs4 import BeautifulSoup

# Define headers
headers = {
    'content-type': 'text/html;charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}

# Amazon Canada product url
url = 'https://www.amazon.ca/INIU-High-Speed-Flashlight-Powerbank-Compatible/dp/B07CZDXDG8?ref_=Oct_s9_apbd_otopr_hd_bw_b3giFrP&pf_rd_r=69GE1K9DG49351YHSYBC&pf_rd_p=694b8fdf-0d96-57ba-b834-dc9bdeb7a094&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE&pf_rd_i=3379552011&th=1'

resp = requests.get(url, headers=headers)
print(resp)
<Response [200]>
Earlier it was showing <Response [503]>; after I added headers it shows <Response [200]>. So I tried to extract some information from the page:
# Using html parser
soup = BeautifulSoup(resp.content, 'lxml')

# Extracting information from page
product_title = soup.find('span', id='productTitle')
print('product_title -', product_title)
product_price = soup.find('span', id='priceblock_ourprice')
print('product_price -', product_price)
('product_title -', None)
('product_price -', None)
But it is showing None, so I checked what data is actually present in the soup:
soup.text
'\n\n\n\nRobot Check\n\n\n\n\nif (true === true) {\n var ue_t0 = (+
new Date()),\n ue_csm = window,\n ue = { t0: ue_t0, d:
function() { return (+new Date() - ue_t0); } },\n ue_furl =
"fls-na.amazon.ca",\n ue_mid = "A2EUQ1WTGCTBG2",\n
ue_sid = (document.cookie.match(/session-id=([0-9-]+)/) || [])[1],\n
ue_sn = "opfcaptcha.amazon.ca",\n ue_id =
\'0B2HQATTKET8J6M36Y3G\';\n}\n\n\n\n\n\n\n\n\n\n\n\nEnter the
characters you see below\nSorry, we just need to make sure you\'re not
a robot. For best results, please make sure your browser is accepting
cookies.\n\n\n\n\n\n\n\n\n\n\nType the characters you see in this
image:\n\n\n\n\n\n\n\n\nTry different
image\n\n\n\n\n\n\n\n\n\n\n\nContinue
shopping\n\n\n\n\n\n\n\n\n\n\n\nConditions of Use &
Sale\n\n\n\n\nPrivacy Notice\n\n\n \xa9 1996-2015,
Amazon.com, Inc. or its affiliates\n \n if (true
=== true) {\n document.write(\'<img src="https://fls-na.amaz\'+\'on.ca/\'+\'1/oc-csi/1/OP/requestId=0B2HQATTKET8J6M36Y3G&js=1"
/>\');\n };\n \n\n\n\n\n\n\n if (true === true)
{\n var head = document.getElementsByTagName(\'head\')[0],\n
prefix =
"https://images-na.ssl-images-amazon.com/images/G/01/csminstrumentation/",\n
elem = document.createElement("script");\n elem.src = prefix +
"csm-captcha-instrumentation.min.js";\n
head.appendChild(elem);\n\n elem =
document.createElement("script");\n elem.src = prefix +
"rd-script-6d68177fa6061598e9509dc4b5bdd08d.js";\n
head.appendChild(elem);\n }\n \n\n'
I checked the output thoroughly, but I didn't find any data in the response; I tried the same with resp.content and found nothing either. I also validated the URL, and it is valid. I even tested the above script with public proxies, but still no output.
Can someone please help me extract information from this URL, or suggest any other way to get it done?
Try this:
import requests
from bs4 import BeautifulSoup

headers = {
    'content-type': 'text/html;charset=UTF-8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

url = 'https://www.amazon.ca/INIU-High-Speed-Flashlight-Powerbank-Compatible/dp/B07CZDXDG8'
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.content, 'lxml')

# Extracting information from page
print('product_title -', soup.find('span', id='productTitle').text.strip())
print('product_price -', soup.find('span', id='priceblock_ourprice').text.strip())
The code yields:
product_title - INIU Power Bank, Ultra-Slim Dual 3A High-Speed Portable Charger, 10000mAh USB C Input & Flashlight External Phone Battery Pack for iPhone Xs X 8 Plus Samsung S10 Google LG iPad etc. [2020 Upgrade]
product_price - CDN$ 60.66
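Note that the empty soup in the question was Amazon's "Robot Check" CAPTCHA page, and Amazon also changes its price markup over time. A more defensive sketch of the extraction (the span.a-price span.a-offscreen fallback selector is an assumption about newer markup):

# Defensive extraction (sketch): handle the CAPTCHA page and missing elements
if 'Robot Check' in resp.text:
    raise RuntimeError("Blocked by Amazon's bot detection; retry later or rotate headers")

title_el = soup.find('span', id='productTitle')
price_el = (soup.find('span', id='priceblock_ourprice')
            or soup.select_one('span.a-price span.a-offscreen'))  # assumed fallback selector
print('product_title -', title_el.text.strip() if title_el else 'not found')
print('product_price -', price_el.text.strip() if price_el else 'not found')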

Can't parse some names and their corresponding urls from a webpage

I've created a Python script using requests and BeautifulSoup to parse the profile names and the links to their profiles from a webpage. The content seems to be generated dynamically, but it is present in the page source. So I tried the following, but unfortunately I get nothing.
SiteLink
My attempt so far:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'cache-control': 'max-age=0',
    'cookie': 'JSESSIONID=8BF2F6FB5603A416DCFBAB8A3BB5A79E.app09-c21-id8; website_user_id=1255553501;',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

def get_info(link):
    res = requests.get(link, headers=headers)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".media__content"):
        profileUrl = item.get("href")
        profileName = item.select_one("[itemprop='name']").get_text()
        print(profileUrl, profileName)

if __name__ == '__main__':
    get_info(URL)
How can I fetch the content from that page?
The required content is available in the page source. The site is very good at discarding requests made with the same user-agent, so I used fake_useragent to supply a random one with each request. It works as long as you don't use it incessantly.
Working solution:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from fake_useragent import UserAgent

URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'

def get_info(s, link):
    s.headers["User-Agent"] = ua.random
    res = s.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".media__content a[itemprop='url']"):
        profileUrl = urljoin(link, item.get("href"))
        profileName = item.select_one("span[itemprop='name']").get_text()
        print(profileUrl, profileName)

if __name__ == '__main__':
    ua = UserAgent()
    with requests.Session() as s:
        get_info(s, URL)
Partial output:
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Stewart-Kipness-2657107a Stewart Kipness
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Andrea-Anglin-Bulin-2631495a Andrea Anglin Bulin
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Betty-DeVinney-2631507a Betty DeVinney
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Sabra-Waldman-2657945a Sabra Waldman
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Russell-Berry-2631447a Russell Berry
The page content is NOT rendered via JavaScript; your code is fine in my case.
You just have an issue finding profileUrl and handling the NoneType exception. You have to focus on the a tag to get the data.
You should try this:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'cache-control': 'max-age=0',
    'cookie': 'JSESSIONID=8BF2F6FB5603A416DCFBAB8A3BB5A79E.app09-c21-id8; website_user_id=1255553501;',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

def get_info(link):
    res = requests.get(link, headers=headers)
    soup = BeautifulSoup(res.text, "lxml")
    results = []
    for item in soup.select(".media__content"):
        a_link = item.find('a')
        if a_link:
            result = {
                'profileUrl': a_link.get('href'),
                'profileName': a_link.get_text()
            }
            results.append(result)
    return results

if __name__ == '__main__':
    info = get_info(URL)
    print(info)
    print(len(info))
OUTPUT:
[{'profileName': 'Stewart Kipness',
'profileUrl': '/CENTURY-21-Judge-Fite-Company-14501c/Stewart-Kipness-2657107a'},
....,
{'profileName': 'Courtney Melkus',
'profileUrl': '/CENTURY-21-Realty-Advisors-47551c/Courtney-Melkus-7389925a'}]
941
It looks like you can construct the url as well (though it does seem easier to just grab it):
import requests
from bs4 import BeautifulSoup as bs

URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'cache-control': 'max-age=0',
    'cookie': 'JSESSIONID=8BF2F6FB5603A416DCFBAB8A3BB5A79E.app09-c21-id8; website_user_id=1255553501;',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

r = requests.get(URL, headers=headers)
soup = bs(r.content, 'lxml')
items = soup.select('.media')

ids = []
names = []
urls = []

for item in items:
    if item.select_one('[data-agent-id]') is not None:
        anId = item.select_one('[data-agent-id]')['data-agent-id']
        ids.append(anId)
        name = item.select_one('[itemprop=name]').text.replace(' ', '-')
        names.append(name)
        url = 'https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/' + name + '-' + anId + 'a'
        urls.append(url)

results = list(zip(names, urls))
print(results)
Please try:
profileUrl = "https://www.century21.com/" + item.select("a")[0].get("href")

How to automatically use the session?

Please help me with the authorization in this script. The problem is that I can't automatically obtain the invbf_session_id value used in the GET request.
import pprint
import re

import bs4
import requests

def scanning_posts():
    print('------------------------enter begin--------------------')
    url = 'http://forum.saransk.ru/'
    html = requests.get(url)
    pprint.pprint(html.headers)

    rawCookie = html.headers['Set-Cookie']
    cookie = re.search(r"invbf_session_id=(.*?);", rawCookie).group(1)
    pprint.pprint(cookie)  # cookie != zzzzzzzzzzzzzzzzzzzzzz

    html = requests.get(url)
    soup = bs4.BeautifulSoup(html.text)
    loginForm = soup.find('form', {'id': 'login'})
    hiddenAuthKey = soup.find('input', {'name': 'auth_key'})['value']

    authData = {
        'ips_username': 'xxxxxx',
        'ips_password': 'yyyyyy',
        'auth_key': hiddenAuthKey,
        'rememberMe': 1,
        'referer': 'http://forum.saransk.ru/'
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
        'Referer': 'http://forum.saransk.ru/forum/'
    }
    # pprint.pprint(authData)
    print('\tlogin: ', authData['ips_username'])

    cookie = dict(invbf_session_id='zzzzzzzzzzzzzzzzzzzzzz')
    req = requests.get(url, params=authData, cookies=cookie, headers=header)

    soup = bs4.BeautifulSoup(req.text)
    signLinkNotLogged = soup.find('a', {'id': 'sign_in'})
    if signLinkNotLogged:
        print('------------------------enter failed--------------------')
    else:
        print('------------------------enter successful--------------------')

scanning_posts()
After running, the script displays the wrong invbf_session_id value (compared with what Firefox's Firebug shows); accordingly, authorization fails.
If I copy the invbf_session_id value from Firebug and paste it into the script, the authorization succeeds.
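For reference, the usual way to avoid copying the cookie by hand is to let a requests.Session carry it automatically: the server sets invbf_session_id on the first response, and the same Session object sends it back on every later request. A minimal sketch of the same flow (credentials and selectors taken from the question):

import bs4
import requests

session = requests.Session()  # cookies, including invbf_session_id, persist here

url = 'http://forum.saransk.ru/'
resp = session.get(url)  # the server sets invbf_session_id; the Session stores it

soup = bs4.BeautifulSoup(resp.text, 'html.parser')
auth_key = soup.find('input', {'name': 'auth_key'})['value']

authData = {
    'ips_username': 'xxxxxx',
    'ips_password': 'yyyyyy',
    'auth_key': auth_key,
    'rememberMe': 1,
    'referer': url,
}
# The same Session sends the stored invbf_session_id back automatically,
# so there is no need to copy it from the browser.
resp = session.get(url, params=authData)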
