Get value from web link - python
I have a URL from which I want to extract the line containing "Underlying Stock: NCC 96.70 As on Jun 06, 2019 10:12:20 IST", pulling the symbol ("NCC") and the underlying price ("96.70") into a list.
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
You can make a request to the site and then parse the result with Beautiful Soup.
Try this:
from bs4 import BeautifulSoup
import requests
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text)
# hacky way of finding and parsing the stock data
print(soup.get_text().split("Underlying Stock")[1][2:10].split(" "))
This prints out:
['NCC', '96.9']
PS: If you get a warning about lxml, it is because no parser was specified; BeautifulSoup falls back to the best one available, which is lxml if you have it installed. To silence the warning, change that line to soup = BeautifulSoup(res.text, features="lxml"). You need lxml installed in your environment, e.g. via pip install lxml.
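If the fixed [2:10] slice bothers you (it can truncate the price, as the 96.9 in the output above suggests), a regular expression is sturdier. A minimal sketch, assuming the headline keeps the "Underlying Stock: SYMBOL PRICE As on ..." shape shown in the question:

import re
import requests
from bs4 import BeautifulSoup

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
soup = BeautifulSoup(requests.get(url).text, "lxml")

# capture the symbol and the full price that follow "Underlying Stock"
match = re.search(r"Underlying Stock:?\s*(\S+)\s+([\d.]+)", soup.get_text())
if match:
    print(list(match.groups()))  # e.g. ['NCC', '96.70']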
Another, less hacky version:

from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
page_html = requests.get(url).text
page_soup = BeautifulSoup(page_html, "html.parser")
# .next steps to the node right after the first <b> tag, i.e. the headline text
print(page_soup.find("b").next.split(' '))
A succinct way is to select the first right-aligned table cell (td[align=right]), which you can simplify to just the attribute selector, [align=right]:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
headline = soup.select_one('[align=right]').text.strip().replace('\xa0\n',' ')
print(headline)
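From that headline you can then peel off the symbol and price by splitting on whitespace. A sketch, assuming the "Underlying Stock: SYMBOL PRICE As on ..." shape from the question:

parts = headline.split()
# parts looks like ['Underlying', 'Stock:', 'NCC', '96.70', 'As', 'on', ...]
symbol, price = parts[2], parts[3]
print([symbol, price])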
You can also take the first row of the first table:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
table = soup.select_one('table')
headline = table.select_one('tr:nth-of-type(1)').text.replace('\n',' ').replace('\xa0', ' ').strip()
print(headline)
from bs4 import BeautifulSoup
import requests
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
# hacky way of finding and parsing the stock data
mylist = soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
print(mylist[:2])
=============================
import pandas as pd
dict1 = {'SYMBOL': ['ACC','ADANIENT','ADANIPORTS','ADANIPOWER','AJANTPHARM','ALBK','AMARAJABAT','AMBUJACEM','APOLLOHOSP','APOLLOTYRE','ARVIND','ASHOKLEY','ASIANPAINT','AUROPHARMA','AXISBANK','BAJAJ-AUTO','BAJAJFINSV','BAJFINANCE','BALKRISIND','BANKBARODA','BANKINDIA','BANKNIFTY','BATAINDIA','BEL','BEML','BERGEPAINT','BHARATFIN','BHARATFORG','BHARTIARTL','BHEL','BIOCON','BOSCHLTD','BPCL','BRITANNIA','BSOFT','CADILAHC','CANBK','CANFINHOME','CASTROLIND','CEATLTD','CENTURYTEX','CESC','CGPOWER','CHENNPETRO','CHOLAFIN','CIPLA','COALINDIA','COLPAL','CONCOR','CUMMINSIND','DABUR','DCBBANK','DHFL','DISHTV','DIVISLAB','DLF','DRREDDY','EICHERMOT','ENGINERSIN','EQUITAS','ESCORTS','EXIDEIND','FEDERALBNK','GAIL','GLENMARK','GMRINFRA','GODFRYPHLP','GODREJCP','GODREJIND','GRASIM','GSFC','HAVELLS','HCLTECH','HDFC','HDFCBANK','HEROMOTOCO','HEXAWARE','HINDALCO','HINDPETRO','HINDUNILVR','HINDZINC','IBULHSGFIN','ICICIBANK','ICICIPRULI','IDBI','IDEA','IDFC','IDFCFIRSTB','IFCI','IGL','INDIACEM','INDIANB','INDIGO','INDUSINDBK','INFIBEAM','INFRATEL','INFY','IOC','IRB','ITC','JETAIRWAYS','JINDALSTEL','JISLJALEQS','JSWSTEEL','JUBLFOOD','JUSTDIAL','KAJARIACER','KOTAKBANK','KSCL','KTKBANK','L&TFH','LICHSGFIN','LT','LUPIN','M&M','M&MFIN','MANAPPURAM','MARICO','MARUTI','MCDOWELL-N','MCX','MFSL','MGL','MINDTREE','MOTHERSUMI','MRF','MRPL','MUTHOOTFIN','NATIONALUM','NBCC','NCC','NESTLEIND','NHPC','NIFTY','NIFTYIT','NIITTECH','NMDC','NTPC','OFSS','OIL','ONGC','ORIENTBANK','PAGEIND','PCJEWELLER','PEL','PETRONET','PFC','PIDILITIND','PNB','POWERGRID','PVR','RAMCOCEM','RAYMOND','RBLBANK','RECLTD','RELCAPITAL','RELIANCE','RELINFRA','REPCOHOME','RPOWER','SAIL','SBIN','SHREECEM','SIEMENS','SOUTHBANK','SRF','SRTRANSFIN','STAR','SUNPHARMA','SUNTV','SUZLON','SYNDIBANK','TATACHEM','TATACOMM','TATAELXSI','TATAGLOBAL','TATAMOTORS','TATAMTRDVR','TATAPOWER','TATASTEEL','TCS','TECHM','TITAN','TORNTPHARM','TORNTPOWER','TV18BRDCST','TVSMOTOR','UBL','UJJIVAN','ULTRACEMCO','UNIONBANK','UPL','VEDL','VGUARD','VOLTAS','WIPRO','WOCKPHARMA','YESBANK','ZEEL'],
'LOT_SIZE': [400,4000,2500,20000,500,13000,700,2500,500,3000,2000,4000,600,1000,1200,250,125,250,800,4000,6000,20,550,6000,700,2200,500,1200,1851,7500,900,30,1800,200,2250,1600,2000,1800,3400,400,600,550,12000,1800,500,1000,2200,700,1563,700,1250,4500,1500,8000,400,2600,250,25,4100,4000,1100,2000,7000,2667,1000,45000,700,600,1500,750,4700,1000,700,500,250,200,1500,3500,2100,300,3200,500,1375,1500,10000,19868,13200,12000,35000,2750,4500,2000,600,300,4000,2000,1200,3500,3200,2400,2200,2250,9000,1500,500,1400,1300,400,1500,4700,4500,1100,375,700,1000,1250,6000,2600,75,1250,700,1200,600,600,2850,10,7000,1500,8000,8000,8000,50,27000,75,50,750,6000,4800,150,3399,3750,7000,25,6500,302,3000,6200,500,7000,4000,400,800,800,1200,6000,1500,500,1300,1100,16000,12000,3000,50,550,33141,250,600,1100,1100,1000,76000,15000,750,1000,400,2250,2000,3800,9000,1061,250,1200,750,500,3000,13000,1000,700,1600,200,7000,600,2300,3000,1000,3200,900,1750,1300]}
df1 = pd.DataFrame(dict1)
dict2 = {'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI', 'BHEL', 'LT'],
'LTP': ['55', '66', '77', '88', '99'],
'PRICE': ['0.25', '0.36', '0.12', '0.28', '0.85']}
df2 = pd.DataFrame(dict2)
print(df1,'\n\n')
print(df2,'\n\n')
# a left merge on SYMBOL pulls each symbol's LOT_SIZE into df2
df2['LOT_SIZE'] = df2[['SYMBOL']].merge(df1, how='left').LOT_SIZE
print(df2)
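An alternative to the merge is a lookup with Series.map, which makes the direction of the join explicit. A sketch using the same df1/df2 as above:

# build a SYMBOL -> LOT_SIZE lookup and map it onto df2
lot_lookup = df1.set_index('SYMBOL')['LOT_SIZE']
df2['LOT_SIZE'] = df2['SYMBOL'].map(lot_lookup)
print(df2)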
Related
lxml to grab All items that share a certain xpath
I'm trying to grab all prices from a website using XPath. All prices have the same XPath, and only [0], i.e. the 1st item, works. Let me show you:

webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)

This successfully prints the 1st price. I tried changing "[0].text" to 1 to print the 2nd item, but it returned "out of range". Then I tried to come up with a for loop that would print all items, so I could compute an average. Any help would be greatly appreciated! I apologize; here is the code, edited in:

from bs4 import BeautifulSoup
from lxml import etree
import requests

URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let me post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
You could just use CSS selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.

import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
    # drop the offers info so only the actual price remains
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)

Prices as a list of floats:

import re
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices.append(float(re.sub(r'\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)
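If you would rather keep lxml and XPath, loop over one relative expression instead of indexing a single absolute path. A sketch, reusing the price-current class from the snippet above and assuming each price's dollar part sits in a <strong> inside it; the site's actual markup may differ:

import requests
from bs4 import BeautifulSoup
from lxml import etree

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709",
                 headers={'User-Agent': 'Mozilla/5.0'})
dom = etree.HTML(str(BeautifulSoup(r.text, "html.parser")))

# one relative XPath matches every price element, not just the first
nodes = dom.xpath('//*[contains(@class, "price-current")]/strong')
prices = [float(node.text.replace(',', '')) for node in nodes if node.text]
if prices:
    print(sum(prices) / len(prices))  # average price across the page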
How to scrape the yahoo earnings calendar with beautifulsoup
How can I scrape the Yahoo earnings calendar to pull out the dates? This is for Python 3.

from bs4 import BeautifulSoup as soup
import urllib.request

url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html, 'lxml')
table = page_soup.find('p')
print(table)

The output is "None".
Beautiful Soup has some find functions that you can use to inspect the DOM; please refer to the documentation.

from bs4 import BeautifulSoup as soup
import urllib.request

url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html, 'lxml')
table = page_soup.find_all('td')

Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:
        # cell has no aria-label attribute; skip it
        pass
print(Dates)
Might be off-topic, but since you want to get a table from a webpage, you might consider pandas, which works with two lines:

import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
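From there the dates can be pulled straight out of the DataFrame. A sketch, assuming the column is labelled 'Earnings Date', matching the aria-label used elsewhere on the page:

import pandas as pd

earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
dates = earnings['Earnings Date'].tolist()  # column label is an assumption
print(dates)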
Here are two succinct ways:

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')

# using an attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]

# using nth-of-type to get the column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
Get a <span> value using python web scrape
I am trying to get a product price using BeautifulSoup in Python, but I keep getting errors no matter what I try. [screenshot of the site omitted] I want to get the 19,90 value. I have already written code that gets all the product names, and now I need their prices.

import requests
from bs4 import BeautifulSoup

url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
price = soup.find('span', itemprop_='price')
print(price)
Less ideal is parsing out the JSON containing the prices:

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

scripts = [script.text for script in soup.select('script') if 'var freedom = freedom ||' in script.text]
pricesJson = scripts[0].split('"items":')[1].split(']')[0] + ']'
prices = [item['price'] for item in json.loads(pricesJson)]
names = [name.text for name in soup.select('#item-list [itemprop=name]')]
results = list(zip(names, prices))
df = pd.DataFrame(results)
print(df)

Sample output:
span[itemprop='price'] is generated by JavaScript. The original value is stored in div[data-final-price], with a value like 1990, and you can format it to 19,90 with a regex:

import re
...
soup = BeautifulSoup(page.text, 'html.parser')
prices = soup.select('div[data-final-price]')
for price in prices:
    price = re.sub(r'(\d\d$)', r',\1', price['data-final-price'])
    print(price)

Results:

19,90
134,89
29,90
119,90
104,90
59,90
....
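Since the raw attribute is just the price in cents, plain string slicing works too. A minimal sketch, assuming the value always has at least three digits:

raw = '1990'  # as read from div[data-final-price]
print(raw[:-2] + ',' + raw[-2:])  # 19,90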
Beautiful Soup PYTHON - inside tags
A little problem with BeautifulSoup:

from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find_all('div', attrs={'class': 'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])

And it gives me each record twice :V That is probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to select only one place, i.e. find(class_='list_list') in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}). Full code:

from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find(class_='list_list').find_all('div', attrs={'class': 'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))

Partial output:

Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809
web scraping with beautiful soup
I have the following code to extract the latest MS Office version for Mac:

import urllib2
from bs4 import BeautifulSoup

quote_page = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('p', attrs={'class': 'x-hidden-focus'})
print name_box

I'm trying to scrape "Office 2016 for Mac (all applications) 15.39.0", but I'm getting None as the output. Any help is appreciated. Thank you.
This works; the explanation is given in the comments.

import requests
import bs4

url = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
table_id = 'tblID0EAGAAA'

resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, 'lxml')

# find the table that contains the data of interest
table = soup.find('table', {'id': table_id})
# get the second row in that table
second_row = table.findAll('tr')[1]
# get the second column in that row
second_column = second_row.findAll('td')[1]
# get the content in this cell
version = second_column.find('p').text
print(version)
A solution that doesn't depend on the table id (which could very well change after every release) or the ordering of the rows:

from bs4 import BeautifulSoup
import requests
import re

page = requests.get('https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current')
pattern = re.compile(r'^Office.+Mac.*')

version = BeautifulSoup(page.content, 'html.parser') \
    .select_one('section.ocpSection table tbody') \
    .find('p', text=pattern) \
    .parent \
    .find_next_sibling('td') \
    .select_one('p') \
    .text
print(version)
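In the same spirit as the pandas read_html trick used for the Yahoo table above, you could also skip the selector chain entirely. A sketch, assuming the page's tables parse as plain HTML and the release name sits in the first column:

import pandas as pd

url = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
# scan every table for a row whose first cell looks like "Office ... Mac ..."
for table in pd.read_html(url):
    hits = table[table.iloc[:, 0].astype(str).str.contains(r'^Office.+Mac')]
    if not hits.empty:
        print(hits.iloc[0].tolist())  # the matching row: release name and version
        break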