Collect the Dropdown List from Link using Request - python

I have a link as below:
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020"
I want to collect all the Expiry Date available as per the image below:
My Code:
########################
import pandas as pd
from requests import Session
import os, time, sys
from datetime import datetime
s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '\
'AppleWebKit/537.36 (KHTML, like Gecko) '\
'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode':9999,'symbol':'BANKNIFTY','instrument': '-','date': '9JAN2020','segmentLink': 17}
res = s.get(URL, params=params)
df1 = pd.read_html(res.content)[0]
df2 = pd.read_html(res.content)[1]
Not able to get the values in df1 nor df2

It needs minimal knowlege of requests and BeautifulSoup or lxml
import requests
import lxml.html
url = 'https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020'
r = requests.get(url)
soup = lxml.html.fromstring(r.text)
items = soup.xpath('//form[#id="ocForm"]//option/text()')
print(items)
Result
[' Select ', '9JAN2020', '16JAN2020', '23JAN2020', '30JAN2020', '6FEB2020', '13FEB2020', '20FEB2020', '27FEB2020', '5MAR2020', '26MAR2020']

import pandas as pd
from requests import Session
import lxml.html
s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '\
'AppleWebKit/537.36 (KHTML, like Gecko) '\
'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode':9999,'symbol':'BANKNIFTY','instrument': 'OPTIDX','date': '-','segmentLink': 17}
res = s.get(URL, params=params)
soup = lxml.html.fromstring(res.text)
items = soup.xpath('//form[#id="ocForm"]//option/text()')
print(items)
text = pd.read_html(res.content)[0].loc[0, 1]
print(text)

Related

How to extract text from the different id from beautifulsoup

I want to extract from the id but every id has different value check it:
div',id='statement80863
div',id='statement26092
and so on ............................
CODE
import requests
from bs4 import BeautifulSoup
import re
limit = 100
url = f'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F{limit}&SORT=LASTNAME'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
rows = soup.find_all('div', {'class':'row'})
for row in rows:
des=row.find('div',id='statement80863').text
print(des)
You can use Regular Expressions to select only such <div> tags.
row.find('div', {'id': re.compile('^statement.*')}) - will select all the <div> tags that has an id which starts with the word statement.
import re
import requests
from bs4 import BeautifulSoup
url = 'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F100&SORT=LASTNAME'
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
rows = soup.find_all('div', class_='row')
for row in rows:
d = row.find('div', {'id': re.compile('^statement*')})
if d:
# Your scraping code here...

Trouble using pandas read_html() : ValueError

from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
res = requests.get(url, verify=True, headers=headers)
with urlopen(url) as doc:
html = BeautifulSoup(res.text, 'lxml')
pgrr = html.find('td', class_='pgRR')
s = str(pgrr.a['href']).split('=')
last_page = s[-1]
df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'
for page in range(1, int(last_page)+1):
page_url = '{}&page={}'.format(sise_url, page)
df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0])
df = df.dropna() # 값이 빠진 행을 제거한다.
print(df)
I'm having this Value error while crawling the Daily stock data in Naver Finance.
I have no trouble getting the url but if i use the read_html() i have Value Error:Table not found issue from the line df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0]). Pls give some advice.
I don't read Korean... however pd.read_html() was getting an error page. Resolved this by requests.get() with headers. Then pass res.text to read_html()
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd
url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
res = requests.get(url, verify=True, headers=headers)
with urlopen(url) as doc:
html = BeautifulSoup(res.text, 'lxml')
pgrr = html.find('td', class_='pgRR')
s = str(pgrr.a['href']).split('=')
last_page = s[-1]
df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'
for page in range(1, int(last_page)+1):
page_url = '{}&page={}'.format(sise_url, page)
res = requests.get(page_url, verify=True, headers=headers)
df = df.append(pd.read_html(res.text, encoding='euc-kr')[0])

Scrape and save the data in to csv in beautiful soup

Below is the url to scrape
https://www.agtta.co.in/individuals.php
I need to extract Name, Mobile number, and Email
I need to save into csv after that
I am able scrape the data full data with below code
Extract using user agent below is the code
from bs4 import BeautifulSoup
import urllib.request
urls=['https://www.agtta.co.in/individuals.php']
for url in urls:
req = urllib.request.Request(
url,
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
)
resp= urllib.request.urlopen(req)
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'),features='html.parser')
scrape_data = soup.find('section', class_='b-branches')
to_list = scrape_data .find_all_next(string=True)
I tried with
for biz in results:
#print(biz)
title = biz.findAll('h3', {'class': 'b-branches__title ui-title-inner ui-title-inner_lg'})
print (title)
I m getting [<h3 class="b-branches__title ui-title-inner ui-title-inner_lg">SHRI RAMESHBHAI P. SAKARIYA</h3>]
Tag is coming while extracting How to remove the tag
My expected out
Name, Mobilenumber, Email
A, 333, mm#gmail.com`
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
urls=['https://www.agtta.co.in/individuals.php']
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
for url in urls:
req = urllib.request.Request(url, headers=headers)
resp= urllib.request.urlopen(req)
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'),features='html.parser')
result = []
for individual in soup.findAll("section", {"class": "b-branches"}):
name = individual.h3.text
phone_data = individual.find('p')
phone = phone_data.text.replace("Mobile No","").strip() if phone_data else ""
email_data = individual.select('div:contains("Email")')
email = email_data[0].text.replace("Email","").strip() if email_data else ""
result.append({"Name":name, "Phone": phone, "Email":email})
output = pd.DataFrame(result)
output.to_csv("Details.csv",index = False)
Here is the full code to do it:
from bs4 import BeautifulSoup
import requests
import pandas as pd
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
r = requests.get('https://www.agtta.co.in/individuals.php',headers = headers).text
soup = BeautifulSoup(r,'html5lib')
sections = soup.find_all('section',class_ = "b-branches")
names = []
phone_numbers = []
emails = []
for section in sections:
name = section.h3.text
names.append(name)
phone_number = section.p.text
phone_number = phone_number.split('Mobile No ')[1]
phone_numbers.append(phone_number)
try:
email = section.find_all('div')[3].text
email = email.split('Email ')[1]
emails.append(email)
except:
emails.append(None)
details_dict = {"Names":names,
"Phone Numbers":phone_numbers,
"Emails":emails}
df = pd.DataFrame(details_dict)
df.to_csv("Details.csv",index = False)
Output:
Hope that this helps!

is get_text() from bs4 different for span tags? Cant remove span tags

Whilst making a web scraper i am able to find an scrape the data available.
on 2 fields of data i am able to use the beautifulsoup get_text() to reomve html from the data
but the 3rd fields will not work when i use get_text(). I can get it to give me the whole span tag just not the text inside it.
i have tried different iterations of getting the data all the same, it will give me the whole span tag ie. stuff
Trying to set busnumber to the phone number inside this span tag
<span class="business--telephoneNumber" itemprop="telephone">01430 422826 </span>
ive tried
from bs4 import BeautifulSoup
import requests
import csv
data_list=[]
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5);
if site.status_code is 200:
content = BeautifulSoup(site.content, 'html.parser')
#print(content)
questions = content.find_all(class_='businessCapsule')
for question in questions:
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnum = question.find('span', {'itemprop': 'telephone'})
print(busnum)
busnumber = busnum.get_text()
new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
data_list.append(new_data)
with open ('selector.csv','w') as file:
writer = csv.DictWriter(file, fieldnames = ["busname", "bustype", "busnumber"], delimiter = ';')
writer.writeheader()
for row in data_list:
writer.writerow(row)
as well as
from bs4 import BeautifulSoup
import requests
import csv
data_list=[]
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5);
if site.status_code is 200:
content = BeautifulSoup(site.content, 'html.parser')
#print(content)
questions = content.find_all(class_='businessCapsule')
for question in questions:
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnumber = question.find('span', {'itemprop': 'telephone'}).get_text()
new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
data_list.append(new_data)
with open ('selector.csv','w') as file:
writer = csv.DictWriter(file, fieldnames = ["busname", "bustype", "busnumber"], delimiter = ';')
writer.writeheader()
for row in data_list:
writer.writerow(row)
on both cases the get_text() gives this error
Traceback (most recent call last):
File "webscraper2.py", line 22, in <module>
busnumber = busnum.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'
if get_text is removed it will give the whole tag
<span class="business--telephoneNumber" itemprop="telephone">01430 422826 </span>
i only need the insed phone number.
update - latest code
from bs4 import BeautifulSoup as bs
import requests
import csv
data_list=[]
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'html.parser')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnumber = question.select_one('span.business--telephoneNumber').text
print(busnumber)
new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
data_list.append(new_data)
with open ('selector.csv','w') as file:
writer = csv.DictWriter(file, fieldnames = ["busname", "bustype", "busnumber"], delimiter = ';')
writer.writeheader()
for row in data_list:
writer.writerow(row)
You need to get a different parent in order to select the appropriate child and change your selector for the child as shown below:
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'lxml')
questions = soup.select('.businessCapsule--mainContent:has(span.business--telephoneNumber)')
for question in questions:
print(question.select_one('span.business--telephoneNumber').text)
If you check this different parent selector you will see it selects the entire box with info in so you can then select your various children
If that is too retrictive you can test if tel was present
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'lxml')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
tel = question.select_one('span.business--telephoneNumber')
if tel is None:
tel = 'Not present'
else:
tel = tel.text
print(tel)

Web scraping twitter

I want to do web scraping on twitter page to download tweets on a specific search word. I am not able to fetch recursively all the tweets, rather I can fetch 20 tweets. Please help to fetch all the tweets recursively. Below is the code
from bs4 import BeautifulSoup
import requests
import pandas as pd
company_name = 'ABC'
url = 'https://twitter.com/search?q=%23%27%20%20%20' + company_name + '&src=typd&lang=en'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
req = requests.get(url, headers=headers);#print(req)
data = req.text;# print(data)
# soup = BeautifulSoup(data, "lxml");# print(soup)
soup = BeautifulSoup(data, "html.parser");# print(soup)
tweets = [p.text for p in soup.findAll('p', class_='tweet-text')]
# print(tweets)
df = pd.DataFrame()
df['Tweet'] = tweets
print(df.head())
print(df.shape)

Categories