Scrape and save the data into a CSV with Beautiful Soup - Python

Below is the URL to scrape:
https://www.agtta.co.in/individuals.php
I need to extract Name, Mobile number, and Email, and then save them into a CSV.
I am able to scrape the full data with the code below, which sends a browser User-Agent header:
from bs4 import BeautifulSoup
import urllib.request

urls = ['https://www.agtta.co.in/individuals.php']
for url in urls:
    req = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features='html.parser')
    scrape_data = soup.find('section', class_='b-branches')
    to_list = scrape_data.find_all_next(string=True)
I tried:
for biz in results:
    # print(biz)
    title = biz.findAll('h3', {'class': 'b-branches__title ui-title-inner ui-title-inner_lg'})
    print(title)
I am getting [<h3 class="b-branches__title ui-title-inner ui-title-inner_lg">SHRI RAMESHBHAI P. SAKARIYA</h3>]
The tag comes along while extracting. How do I remove the tag and keep only the text?
My expected output:
Name, Mobilenumber, Email
A, 333, mm@gmail.com
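findAll returns a list of tags; calling .get_text() (or .text) on a tag returns only the string inside, without the markup. A minimal sketch, reusing the title list from the snippet above:

title = biz.findAll('h3', {'class': 'b-branches__title ui-title-inner ui-title-inner_lg'})
if title:
    print(title[0].get_text(strip=True))  # SHRI RAMESHBHAI P. SAKARIYA

Applied to the whole page, with the CSV written at the end: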

from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

urls = ['https://www.agtta.co.in/individuals.php']
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
result = []
for url in urls:
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features='html.parser')
    for individual in soup.findAll("section", {"class": "b-branches"}):
        name = individual.h3.text
        phone_data = individual.find('p')
        phone = phone_data.text.replace("Mobile No", "").strip() if phone_data else ""
        # ':contains' is deprecated in soupsieve; ':-soup-contains' is the current spelling
        email_data = individual.select('div:-soup-contains("Email")')
        email = email_data[0].text.replace("Email", "").strip() if email_data else ""
        result.append({"Name": name, "Phone": phone, "Email": email})

output = pd.DataFrame(result)
output.to_csv("Details.csv", index=False)
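If you would rather skip the pandas dependency for the CSV step, the standard-library csv module writes the same file; a minimal sketch, assuming the result list of dicts built above:

import csv

# result is the list of {"Name": ..., "Phone": ..., "Email": ...} dicts from above
with open("Details.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["Name", "Phone", "Email"])
    writer.writeheader()
    writer.writerows(result)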

Here is the full code to do it:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
r = requests.get('https://www.agtta.co.in/individuals.php', headers=headers).text
soup = BeautifulSoup(r, 'html5lib')
sections = soup.find_all('section', class_="b-branches")

names = []
phone_numbers = []
emails = []
for section in sections:
    name = section.h3.text
    names.append(name)
    phone_number = section.p.text
    phone_number = phone_number.split('Mobile No ')[1]
    phone_numbers.append(phone_number)
    try:
        email = section.find_all('div')[3].text
        email = email.split('Email ')[1]
        emails.append(email)
    except IndexError:  # some entries have no email
        emails.append(None)

details_dict = {"Names": names,
                "Phone Numbers": phone_numbers,
                "Emails": emails}
df = pd.DataFrame(details_dict)
df.to_csv("Details.csv", index=False)
Output: Details.csv with the columns Names, Phone Numbers, and Emails.
Hope that this helps!

Related

Can't retrieve an email from a webpage using the requests module

I'm trying to fetch an email from a webpage using the requests module. The problem is that the email address seems to be encoded or obfuscated somehow, which is why it is unreadable, and I wish to decode it into its usual form.
import requests
from bs4 import BeautifulSoup

link = 'https://global-standard.org/find-suppliers-shops-and-inputs/certified-suppliers/database/search_result/38996'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}
res = requests.get(link, headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
email = soup.select_one("script[type='text/javascript']:-soup-contains('emailProtector')").contents[0]
print(email)
When I run the above script, the following is what I get:
emailProtector.addCloakedMailto("ep_586c4771", 1);
This is the result I'm after:
fttextilegroup2017@gmail.com
You can try:
import re
import requests
from bs4 import BeautifulSoup

url = 'https://global-standard.org/find-suppliers-shops-and-inputs/certified-suppliers/database/search_result/38996'

def decloak(cloaked_tag, attr_name):
    # one attribute name is collected forwards, all others backwards
    a, b = "", ""
    for span in cloaked_tag.select('span'):
        for attr in span.attrs:
            if attr == attr_name:
                a += span[attr]
            else:
                b = span[attr] + b
    return a + b

soup = BeautifulSoup(requests.get(url).content, 'html.parser')
attr_name = re.search(r'nodeName\.toLowerCase\(\)\.indexOf\("(.*?)"', str(soup)).group(1)
mail = decloak(soup.select_one('.cloaked_email'), attr_name)
print(mail)
Prints:
fttextilegroup2017@gmail.com
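For context, this kind of cloak usually splits the address across span attributes, reading one attribute name forwards and everything else backwards; decloak above just replays that. A self-contained sketch on made-up markup (the data-a/data-b attribute names are illustrative, not the site's real ones):

from bs4 import BeautifulSoup

def decloak(cloaked_tag, attr_name):
    a, b = "", ""
    for span in cloaked_tag.select('span'):
        for attr in span.attrs:
            if attr == attr_name:
                a += span[attr]
            else:
                b = span[attr] + b
    return a + b

# hypothetical cloaked markup, for illustration only
html = ('<span class="cloaked_email"><span data-a="user">.</span>'
        '<span data-b="example.com">.</span><span data-a="@">.</span></span>')
soup = BeautifulSoup(html, 'html.parser')
print(decloak(soup.select_one('.cloaked_email'), 'data-a'))  # user@example.com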

getting an empty list when trying to extract urls from google with beautifulsoup

I am trying to extract the first 100 URLs returned from a location search on Google; however, I am getting an empty list ("no results found") every time.
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = []
    if results:
        counter = 0
        for result in results:
            websites.append(result.find("a")["href"])
            counter += 1
            if counter == 100:
                break
    else:
        print("No search results found.")
    return websites

location = "Athens"
print(get_location_info(location))
No search results found.
[]
I have also tried this approach:
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = [result.find("a")["href"] for result in results][:10]
    return websites

location = "sifnos"
print(get_location_info(location))
and I get an empty list. I think I am doing everything suggested in similar posts, but I still get nothing.
Always and first of all, take a look at your soup to see if all the expected ingredients are in place.
Select your elements more specifically, in this case for example with a CSS selector:
[a.get('href') for a in soup.select('a:has(>h3)')]
To avoid the consent banner, also send a consent cookie:
cookies={'CONSENT': 'YES+'}
Example
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers, cookies={'CONSENT': 'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    websites = [a.get('href') for a in soup.select('a:has(>h3)')]
    return websites

location = "sifnos"
print(get_location_info(location))
Output
['https://www.griechenland.de/sifnos/', 'http://de.sifnos-greece.com/plan-trip-to-sifnos/travel-information.php', 'https://www.sifnosisland.gr/', 'https://www.visitgreece.gr/islands/cyclades/sifnos/', 'http://www.griechenland-insel.de/Hauptseiten/sifnos.htm', 'https://worldonabudget.de/sifnos-griechenland/', 'https://goodmorningworld.de/sifnos-griechenland/', 'https://de.wikipedia.org/wiki/Sifnos', 'https://sifnos.gr/en/sifnos/', 'https://www.discovergreece.com/de/cyclades/sifnos']
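To cap the list at the first 100 results, as the question originally asked, slicing the comprehension is enough (a one-line sketch):

websites = [a.get('href') for a in soup.select('a:has(>h3)')][:100]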

How to extract key info from <script> tag

I'm trying to extract the user id from this link:
https://www.instagram.com/design.kaf/
using bs4 and regex.
I found a JSON key inside a script tag called "profile_id", but I can't even search for that script tag.
You can find my regex attempt here:
https://regex101.com/r/WmlAEc/1
Also, I can't find a selector that pulls out this particular <script> tag.
My code:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.instagram.com/design.kaf/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
}
response = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
a = str(soup.find_all("script"))
x = re.findall(r'profile_id":"-?\d+"', a)
id = int(x[0])  # fails: x[0] still contains the key and the quotes
print(id)
Here is another answer using the re approach:
import requests
from bs4 import BeautifulSoup
import re, ast

url = 'https://www.instagram.com/design.kaf/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
s = str(soup.findAll('script'))
# this will find "profile_id":"5172989370"
to_be_find_string = re.findall(r'"profile_id":"-?\d+"', s)[0]  # regex changed by adding a double quote at the beginning
string_formatted_as_dict = '{' + to_be_find_string + '}'
# converts the dict-formatted <str> to an actual <dict>
profile_dict = ast.literal_eval(string_formatted_as_dict)
print(profile_dict['profile_id'])  # prints the user id, i.e. 5172989370
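A regex capture group can skip the dict round-trip entirely; a minimal sketch, assuming the same stringified script text s as above:

import re

m = re.search(r'"profile_id":"(-?\d+)"', s)
if m:
    print(int(m.group(1)))  # e.g. 5172989370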
You can try this code; it is an approach with a loop and string search:
import requests
from bs4 import BeautifulSoup

url = 'https://www.instagram.com/design.kaf/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
s = str(soup.findAll('script'))

id_str, counter = '', 0
# our required string format: "profile_id":"0123456789...",
str_to_find = '"profile_id":"'
index_p = s.find(str_to_find)  # index of the first character, i.e. the opening double quote
while True:
    # the first digit of the id starts at index_p + length of the searched string
    if s[index_p + len(str_to_find) + counter] == '"':
        break  # iteration stops when we reach the closing double quote
    else:
        id_str += s[index_p + len(str_to_find) + counter]
        counter += 1
print(id_str)  # prints 5172989370 in this case
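One caveat: str.find returns -1 when the key is missing, which would make the loop index from the wrong place; a small guard (sketch) right after the find call avoids that:

if index_p == -1:
    raise ValueError('"profile_id" not found in the page scripts')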

Collect the Dropdown List from Link using Request

I have a link as below:
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020"
I want to collect all the Expiry Dates available in the page's dropdown.
My Code:
########################
import pandas as pd
from requests import Session
import os, time, sys
from datetime import datetime

s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
           'AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode': 9999, 'symbol': 'BANKNIFTY', 'instrument': '-', 'date': '9JAN2020', 'segmentLink': 17}
res = s.get(URL, params=params)
df1 = pd.read_html(res.content)[0]
df2 = pd.read_html(res.content)[1]
I am not able to get the values in either df1 or df2.
It needs minimal knowledge of requests and BeautifulSoup or lxml:
import requests
import lxml.html

url = 'https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020'
r = requests.get(url)
soup = lxml.html.fromstring(r.text)
items = soup.xpath('//form[@id="ocForm"]//option/text()')
print(items)
Result
[' Select ', '9JAN2020', '16JAN2020', '23JAN2020', '30JAN2020', '6FEB2020', '13FEB2020', '20FEB2020', '27FEB2020', '5MAR2020', '26MAR2020']
import pandas as pd
from requests import Session
import lxml.html

s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
           'AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode': 9999, 'symbol': 'BANKNIFTY', 'instrument': 'OPTIDX', 'date': '-', 'segmentLink': 17}
res = s.get(URL, params=params)
soup = lxml.html.fromstring(res.text)
items = soup.xpath('//form[@id="ocForm"]//option/text()')
print(items)
text = pd.read_html(res.content)[0].loc[0, 1]
print(text)
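The same dropdown can also be read with BeautifulSoup instead of lxml, filtering out the placeholder entry; a minimal sketch, assuming the res response fetched above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(res.text, 'html.parser')
options = [o.get_text(strip=True) for o in soup.select('form#ocForm option')]
expiry_dates = [o for o in options if o and o != 'Select']
print(expiry_dates)  # ['9JAN2020', '16JAN2020', ...]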

is get_text() from bs4 different for span tags? Can't remove span tags

While making a web scraper I am able to find and scrape the data available.
On two fields of data I am able to use BeautifulSoup's get_text() to remove the HTML from the data, but the third field will not work when I use get_text(). I can get it to give me the whole span tag, just not the text inside it.
I have tried different iterations of getting the data; all of them give me the whole span tag, e.g. <span>stuff</span>.
I am trying to set busnumber to the phone number inside this span tag:
<span class="business--telephoneNumber" itemprop="telephone">01430 422826 </span>
I've tried:
from bs4 import BeautifulSoup
import requests
import csv

data_list = []
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
if site.status_code == 200:
    content = BeautifulSoup(site.content, 'html.parser')
    # print(content)
    questions = content.find_all(class_='businessCapsule')
    for question in questions:
        busname = question.find(class_='businessCapsule--name').get_text()
        bustype = question.find(class_='businessCapsule--classification').get_text()
        busnum = question.find('span', {'itemprop': 'telephone'})
        print(busnum)
        busnumber = busnum.get_text()
        new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
        data_list.append(new_data)

with open('selector.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
as well as
from bs4 import BeautifulSoup
import requests
import csv

data_list = []
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
if site.status_code == 200:
    content = BeautifulSoup(site.content, 'html.parser')
    # print(content)
    questions = content.find_all(class_='businessCapsule')
    for question in questions:
        busname = question.find(class_='businessCapsule--name').get_text()
        bustype = question.find(class_='businessCapsule--classification').get_text()
        busnumber = question.find('span', {'itemprop': 'telephone'}).get_text()
        new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
        data_list.append(new_data)

with open('selector.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
In both cases get_text() gives this error:
Traceback (most recent call last):
  File "webscraper2.py", line 22, in <module>
    busnumber = busnum.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'
If get_text() is removed, it gives the whole tag:
<span class="business--telephoneNumber" itemprop="telephone">01430 422826 </span>
I only need the phone number inside.
Update, latest code:
from bs4 import BeautifulSoup as bs
import requests
import csv

data_list = []
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'html.parser')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
    busname = question.find(class_='businessCapsule--name').get_text()
    bustype = question.find(class_='businessCapsule--classification').get_text()
    busnumber = question.select_one('span.business--telephoneNumber').text
    print(busnumber)
    new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
    data_list.append(new_data)

with open('selector.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
You need to get a different parent in order to select the appropriate child, and change your selector for the child, as shown below:
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'lxml')
questions = soup.select('.businessCapsule--mainContent:has(span.business--telephoneNumber)')
for question in questions:
    print(question.select_one('span.business--telephoneNumber').text)
If you check this different parent selector, you will see it selects the entire box with the info in it, so you can then select your various children.
If that is too restrictive, you can instead test whether the telephone element is present:
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'lxml')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
    tel = question.select_one('span.business--telephoneNumber')
    if tel is None:
        tel = 'Not present'
    else:
        tel = tel.text
    print(tel)
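Plugging that None check back into the question's CSV-writing loop could look like this; a sketch reusing the asker's field names, and assuming every capsule has a name and classification, as in the question's code:

import csv

# questions comes from the snippet above
data_list = []
for question in questions:
    busname = question.find(class_='businessCapsule--name').get_text()
    bustype = question.find(class_='businessCapsule--classification').get_text()
    tel = question.select_one('span.business--telephoneNumber')
    busnumber = tel.text.strip() if tel else 'Not present'
    data_list.append({"busname": busname, "bustype": bustype, "busnumber": busnumber})

with open('selector.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
    writer.writeheader()
    writer.writerows(data_list)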
