Get value from web link - python
I have a URL from which I want to extract the line containing "Underlying Stock: NCC 96.70 As on Jun 06, 2019 10:12:20 IST", pulling the symbol ("NCC") and the underlying price ("96.70") into a list.
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
You can make a request to the site and then parse the result with Beautiful Soup.
Try this:
from bs4 import BeautifulSoup
import requests
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text)
# hacky way of finding and parsing the stock data
print(soup.get_text().split("Underlying Stock")[1][2:10].split(" "))
This prints out:
['NCC', '96.9']
PS: If you get a warning about lxml, it is because no parser was specified; BeautifulSoup falls back to the best one available, which is lxml if you have it installed. To silence the warning, change that line to soup = BeautifulSoup(res.text, features="lxml"). You need lxml installed in your environment, e.g. via pip install lxml.
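If the fixed [2:10] slice bothers you (it can truncate the price, as the 96.9 in the output above suggests), a regular expression is sturdier. A minimal sketch, assuming the headline keeps the "Underlying Stock: SYMBOL PRICE As on ..." shape shown in the question:

import re
import requests
from bs4 import BeautifulSoup

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
soup = BeautifulSoup(requests.get(url).text, "lxml")

# capture the symbol and the full price that follow "Underlying Stock"
match = re.search(r"Underlying Stock:?\s*(\S+)\s+([\d.]+)", soup.get_text())
if match:
    print(list(match.groups()))  # e.g. ['NCC', '96.70']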
Another, less hacky version:

from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
page_html = requests.get(url).text
page_soup = BeautifulSoup(page_html, "html.parser")
# .next steps to the node right after the first <b> tag, i.e. the headline text
print(page_soup.find("b").next.split(' '))
A succinct way is to select the first right-aligned table cell (td[align=right]), which you can simplify to just the attribute selector, [align=right]:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
headline = soup.select_one('[align=right]').text.strip().replace('\xa0\n',' ')
print(headline)
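From that headline you can then peel off the symbol and price by splitting on whitespace. A sketch, assuming the "Underlying Stock: SYMBOL PRICE As on ..." shape from the question:

parts = headline.split()
# parts looks like ['Underlying', 'Stock:', 'NCC', '96.70', 'As', 'on', ...]
symbol, price = parts[2], parts[3]
print([symbol, price])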
You can also take the first row of the first table:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
table = soup.select_one('table')
headline = table.select_one('tr:nth-of-type(1)').text.replace('\n',' ').replace('\xa0', ' ').strip()
print(headline)
from bs4 import BeautifulSoup
import requests
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
# hacky way of finding and parsing the stock data
mylist = soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
print(mylist[:2])
=============================
import pandas as pd
dict1 = {'SYMBOL': ['ACC','ADANIENT','ADANIPORTS','ADANIPOWER','AJANTPHARM','ALBK','AMARAJABAT','AMBUJACEM','APOLLOHOSP','APOLLOTYRE','ARVIND','ASHOKLEY','ASIANPAINT','AUROPHARMA','AXISBANK','BAJAJ-AUTO','BAJAJFINSV','BAJFINANCE','BALKRISIND','BANKBARODA','BANKINDIA','BANKNIFTY','BATAINDIA','BEL','BEML','BERGEPAINT','BHARATFIN','BHARATFORG','BHARTIARTL','BHEL','BIOCON','BOSCHLTD','BPCL','BRITANNIA','BSOFT','CADILAHC','CANBK','CANFINHOME','CASTROLIND','CEATLTD','CENTURYTEX','CESC','CGPOWER','CHENNPETRO','CHOLAFIN','CIPLA','COALINDIA','COLPAL','CONCOR','CUMMINSIND','DABUR','DCBBANK','DHFL','DISHTV','DIVISLAB','DLF','DRREDDY','EICHERMOT','ENGINERSIN','EQUITAS','ESCORTS','EXIDEIND','FEDERALBNK','GAIL','GLENMARK','GMRINFRA','GODFRYPHLP','GODREJCP','GODREJIND','GRASIM','GSFC','HAVELLS','HCLTECH','HDFC','HDFCBANK','HEROMOTOCO','HEXAWARE','HINDALCO','HINDPETRO','HINDUNILVR','HINDZINC','IBULHSGFIN','ICICIBANK','ICICIPRULI','IDBI','IDEA','IDFC','IDFCFIRSTB','IFCI','IGL','INDIACEM','INDIANB','INDIGO','INDUSINDBK','INFIBEAM','INFRATEL','INFY','IOC','IRB','ITC','JETAIRWAYS','JINDALSTEL','JISLJALEQS','JSWSTEEL','JUBLFOOD','JUSTDIAL','KAJARIACER','KOTAKBANK','KSCL','KTKBANK','L&TFH','LICHSGFIN','LT','LUPIN','M&M','M&MFIN','MANAPPURAM','MARICO','MARUTI','MCDOWELL-N','MCX','MFSL','MGL','MINDTREE','MOTHERSUMI','MRF','MRPL','MUTHOOTFIN','NATIONALUM','NBCC','NCC','NESTLEIND','NHPC','NIFTY','NIFTYIT','NIITTECH','NMDC','NTPC','OFSS','OIL','ONGC','ORIENTBANK','PAGEIND','PCJEWELLER','PEL','PETRONET','PFC','PIDILITIND','PNB','POWERGRID','PVR','RAMCOCEM','RAYMOND','RBLBANK','RECLTD','RELCAPITAL','RELIANCE','RELINFRA','REPCOHOME','RPOWER','SAIL','SBIN','SHREECEM','SIEMENS','SOUTHBANK','SRF','SRTRANSFIN','STAR','SUNPHARMA','SUNTV','SUZLON','SYNDIBANK','TATACHEM','TATACOMM','TATAELXSI','TATAGLOBAL','TATAMOTORS','TATAMTRDVR','TATAPOWER','TATASTEEL','TCS','TECHM','TITAN','TORNTPHARM','TORNTPOWER','TV18BRDCST','TVSMOTOR','UBL','UJJIVAN','ULTRACEMCO','UNIONBANK','UPL','VEDL','VGUARD','VOLTAS','WIPRO','WOCKPHARMA','YESBANK','ZEEL'],
'LOT_SIZE': [400,4000,2500,20000,500,13000,700,2500,500,3000,2000,4000,600,1000,1200,250,125,250,800,4000,6000,20,550,6000,700,2200,500,1200,1851,7500,900,30,1800,200,2250,1600,2000,1800,3400,400,600,550,12000,1800,500,1000,2200,700,1563,700,1250,4500,1500,8000,400,2600,250,25,4100,4000,1100,2000,7000,2667,1000,45000,700,600,1500,750,4700,1000,700,500,250,200,1500,3500,2100,300,3200,500,1375,1500,10000,19868,13200,12000,35000,2750,4500,2000,600,300,4000,2000,1200,3500,3200,2400,2200,2250,9000,1500,500,1400,1300,400,1500,4700,4500,1100,375,700,1000,1250,6000,2600,75,1250,700,1200,600,600,2850,10,7000,1500,8000,8000,8000,50,27000,75,50,750,6000,4800,150,3399,3750,7000,25,6500,302,3000,6200,500,7000,4000,400,800,800,1200,6000,1500,500,1300,1100,16000,12000,3000,50,550,33141,250,600,1100,1100,1000,76000,15000,750,1000,400,2250,2000,3800,9000,1061,250,1200,750,500,3000,13000,1000,700,1600,200,7000,600,2300,3000,1000,3200,900,1750,1300]}
df1 = pd.DataFrame(dict1)
dict2 = {'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI', 'BHEL', 'LT'],
'LTP': ['55', '66', '77', '88', '99'],
'PRICE': ['0.25', '0.36', '0.12', '0.28', '0.85']}
df2 = pd.DataFrame(dict2)
print(df1,'\n\n')
print(df2,'\n\n')
# a left merge on SYMBOL pulls each symbol's LOT_SIZE into df2
df2['LOT_SIZE'] = df2[['SYMBOL']].merge(df1, how='left').LOT_SIZE
print(df2)
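An alternative to the merge is a lookup with Series.map, which makes the direction of the join explicit. A sketch using the same df1/df2 as above:

# build a SYMBOL -> LOT_SIZE lookup and map it onto df2
lot_lookup = df1.set_index('SYMBOL')['LOT_SIZE']
df2['LOT_SIZE'] = df2['SYMBOL'].map(lot_lookup)
print(df2)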
Related
lxml to grab All items that share a certain xpath
I'm trying to grab all prices from a website using XPath. All prices have the same XPath, and only [0], i.e. the 1st item, works. Let me show you:

webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)

This successfully prints the 1st price. I tried changing "[0].text" to 1 to print the 2nd item, but it returned "out of range". Then I tried to come up with a for loop that would print all items, so I could compute an average. Any help would be greatly appreciated! I apologize; here is the code, edited in:

from bs4 import BeautifulSoup
from lxml import etree
import requests

URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let me post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
You could just use CSS selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.

import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
    # drop the offers info so only the actual price remains
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)

Prices as a list of floats:

import re
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices.append(float(re.sub(r'\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)
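If you would rather keep lxml and XPath, loop over one relative expression instead of indexing a single absolute path. A sketch, reusing the price-current class from the snippet above and assuming each price's dollar part sits in a <strong> inside it; the site's actual markup may differ:

import requests
from bs4 import BeautifulSoup
from lxml import etree

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709",
                 headers={'User-Agent': 'Mozilla/5.0'})
dom = etree.HTML(str(BeautifulSoup(r.text, "html.parser")))

# one relative XPath matches every price element, not just the first
nodes = dom.xpath('//*[contains(@class, "price-current")]/strong')
prices = [float(node.text.replace(',', '')) for node in nodes if node.text]
if prices:
    print(sum(prices) / len(prices))  # average price across the page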
How to scrape the yahoo earnings calendar with beautifulsoup
How can I scrape the Yahoo earnings calendar to pull out the dates? This is for Python 3.

from bs4 import BeautifulSoup as soup
import urllib.request

url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html, 'lxml')
table = page_soup.find('p')
print(table)

The output is "None".
Beautiful Soup has some find functions that you can use to inspect the DOM; please refer to the documentation.

from bs4 import BeautifulSoup as soup
import urllib.request

url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html, 'lxml')
table = page_soup.find_all('td')

Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:
        # cell has no aria-label attribute; skip it
        pass
print(Dates)
Might be off-topic, but since you want to get a table from a webpage, you might consider pandas, which works with two lines:

import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
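From there the dates can be pulled straight out of the DataFrame. A sketch, assuming the column is labelled 'Earnings Date', matching the aria-label used elsewhere on the page:

import pandas as pd

earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
dates = earnings['Earnings Date'].tolist()  # column label is an assumption
print(dates)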
Here are two succinct ways:

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')

# using an attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]

# using nth-of-type to get the column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
Get a <span> value using python web scrape
I am trying to get a product price using BeautifulSoup in Python, but I keep getting errors no matter what I try. [screenshot of the site omitted] I want to get the 19,90 value. I have already written code that gets all the product names, and now I need their prices.

import requests
from bs4 import BeautifulSoup

url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
price = soup.find('span', itemprop_='price')
print(price)
Less ideal is parsing out the JSON containing the prices:

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

scripts = [script.text for script in soup.select('script') if 'var freedom = freedom ||' in script.text]
pricesJson = scripts[0].split('"items":')[1].split(']')[0] + ']'
prices = [item['price'] for item in json.loads(pricesJson)]
names = [name.text for name in soup.select('#item-list [itemprop=name]')]
results = list(zip(names, prices))
df = pd.DataFrame(results)
print(df)

Sample output:
span[itemprop='price'] is generated by JavaScript. The original value is stored in div[data-final-price], with a value like 1990, and you can format it to 19,90 with a regex:

import re
...
soup = BeautifulSoup(page.text, 'html.parser')
prices = soup.select('div[data-final-price]')
for price in prices:
    price = re.sub(r'(\d\d$)', r',\1', price['data-final-price'])
    print(price)

Results:

19,90
134,89
29,90
119,90
104,90
59,90
....
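Since the raw attribute is just the price in cents, plain string slicing works too. A minimal sketch, assuming the value always has at least three digits:

raw = '1990'  # as read from div[data-final-price]
print(raw[:-2] + ',' + raw[-2:])  # 19,90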
Beautiful Soup PYTHON - inside tags
A little problem with BeautifulSoup:

from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find_all('div', attrs={'class': 'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])

And it gives me each record twice :V That is probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to select only one place, i.e. find(class_='list_list') in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}). Full code:

from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find(class_='list_list').find_all('div', attrs={'class': 'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))

Partial output:

Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809
web scraping with beautiful soup
I have the following code to extract the latest MS Office version for Mac:

import urllib2
from bs4 import BeautifulSoup

quote_page = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('p', attrs={'class': 'x-hidden-focus'})
print name_box

I'm trying to scrape "Office 2016 for Mac (all applications) 15.39.0", but I'm getting None as the output. Any help is appreciated. Thank you.
This works; the explanation is given in the comments.

import requests
import bs4

url = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
table_id = 'tblID0EAGAAA'

resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, 'lxml')

# find the table that contains the data of interest
table = soup.find('table', {'id': table_id})
# get the second row in that table
second_row = table.findAll('tr')[1]
# get the second column in that row
second_column = second_row.findAll('td')[1]
# get the content in this cell
version = second_column.find('p').text
print(version)
A solution that doesn't depend on the table id (which could very well change after every release) or the ordering of the rows:

from bs4 import BeautifulSoup
import requests
import re

page = requests.get('https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current')
pattern = re.compile(r'^Office.+Mac.*')

version = BeautifulSoup(page.content, 'html.parser') \
    .select_one('section.ocpSection table tbody') \
    .find('p', text=pattern) \
    .parent \
    .find_next_sibling('td') \
    .select_one('p') \
    .text
print(version)
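In the same spirit as the pandas read_html trick used for the Yahoo table above, you could also skip the selector chain entirely. A sketch, assuming the page's tables parse as plain HTML and the release name sits in the first column:

import pandas as pd

url = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
# scan every table for a row whose first cell looks like "Office ... Mac ..."
for table in pd.read_html(url):
    hits = table[table.iloc[:, 0].astype(str).str.contains(r'^Office.+Mac')]
    if not hits.empty:
        print(hits.iloc[0].tolist())  # the matching row: release name and version
        break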