Web scraping with Beautiful Soup - Python
I have the following code to extract the latest MS Office version for Mac:
import urllib2
from bs4 import BeautifulSoup
quote_page = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('p', attrs={'class': 'x-hidden-focus'})
print name_box
I'm trying to scrape the row that reads "Office 2016 for Mac (all applications)" and its version number, 15.39.0, but I'm getting None as the output. Any help is appreciated. Thank you.
This works; an explanation is given in the comments.
import requests
import bs4
url = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
table_id = 'tblID0EAGAAA'
resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, 'lxml')
# find the table that contains the data of interest
table = soup.find('table', {'id': table_id})
# get the second row in that table
second_row = table.find_all('tr')[1]
# get the second column in that row
second_column = second_row.find_all('td')[1]
# get the content in this cell
version = second_column.find('p').text
print(version)
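
If the markup shifts, find() can return None and the chained indexing will raise an exception; below is a small defensive rewrite of the same lookup, a sketch under the same table-id assumption as above:

import bs4
import requests

url = 'https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current'
resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'id': 'tblID0EAGAAA'})
if table is None:
    raise SystemExit('table not found - the id may have changed')
rows = table.find_all('tr')
# guard the row/column indexing as well
if len(rows) > 1 and len(rows[1].find_all('td')) > 1:
    print(rows[1].find_all('td')[1].get_text(strip=True))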
A solution that doesn't depend on the table id (which very well could change after every release) or the ordering of the rows:
from bs4 import BeautifulSoup
import requests
import re
page = requests.get('https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current')
pattern = re.compile(r'^Office.+Mac.*')
version = BeautifulSoup(page.content, 'html.parser') \
    .select_one('section.ocpSection table tbody') \
    .find('p', text=pattern) \
    .parent \
    .find_next_sibling('td') \
    .select_one('p') \
    .text
print(version)
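
Every step in that chain can come back as None and turn into an AttributeError, so a more defensive variant of the same lookup may be worth the extra lines; a sketch under the same markup assumptions as above:

from bs4 import BeautifulSoup
import requests
import re

page = requests.get('https://support.office.com/en-us/article/Update-history-for-Office-2016-for-Mac-700cab62-0d67-4f23-947b-3686cb1a8eb7#bkmk_current')
soup = BeautifulSoup(page.content, 'html.parser')
tbody = soup.select_one('section.ocpSection table tbody')
cell = tbody.find('p', text=re.compile(r'^Office.+Mac.*')) if tbody else None
version_td = cell.parent.find_next_sibling('td') if cell else None
if version_td:
    print(version_td.get_text(strip=True))
else:
    print('layout changed - selectors need updating')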
Related
How to scrape the Yahoo earnings calendar with BeautifulSoup
How can I scrape the Yahoo earnings calendar to pull out the dates? This is for Python 3.

from bs4 import BeautifulSoup as soup
import urllib
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html, 'lxml')
table = page_soup.find('p')
print(table)

The output is "None".
Beautiful Soup has some find functions that you can use to inspect the DOM; please refer to the documentation.

from bs4 import BeautifulSoup as soup
import urllib.request

url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html, 'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:
        pass  # cell has no aria-label attribute
print(Dates)
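
The try/except over every cell can be avoided by filtering on the attribute directly in find_all; a short sketch under the same aria-label assumption:

from bs4 import BeautifulSoup
import urllib.request

url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
html = urllib.request.urlopen(url).read()
page_soup = BeautifulSoup(html, 'lxml')
# only <td> elements whose aria-label is exactly "Earnings Date" match
dates = [td.text for td in page_soup.find_all('td', attrs={'aria-label': 'Earnings Date'})]
print(dates)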
Might be off-topic, but since you want to get a table from a webpage, you might consider using pandas, which works in two lines:

import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
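
For completeness, a quick look at what read_html returns; the column names are whatever the page serves, so inspect them before relying on any particular label:

import pandas as pd

earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
print(earnings.head())
print(earnings.columns.tolist())  # check column names before selecting one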
Here are two succinct ways:

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using an attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
# using nth-of-type to get the column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
Get value from web link
I have a URL from which I want to extract the line reading "Underlying Stock: NCC 96.70 As on Jun 06, 2019 10:12:20 IST", pulling the symbol ("NCC") and the underlying price ("96.70") into a list.

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
You can make a request to the site and then parse the result with Beautiful Soup. Try this:

from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text)
# hacky way of finding and parsing the stock data
print(soup.get_text().split("Underlying Stock")[1][2:10].split(" "))

This prints out:

['NCC', '96.9']

PS: If you get a warning about lxml: it is the default parser given that you have it installed. In that case change the parsing line to soup = BeautifulSoup(res.text, features="lxml"). You need lxml installed, e.g. with pip install lxml in your environment.
Another version, less hacky:

import requests
from bs4 import BeautifulSoup

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
page_html = requests.get(url).text
page_soup = BeautifulSoup(page_html, "html.parser")
print(page_soup.find("b").next.split(' '))
A succinct way is to select for the first right-aligned table cell (td[align=right]), which you can simplify to just the attribute, [align=right]:

from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
headline = soup.select_one('[align=right]').text.strip().replace('\xa0\n', ' ')
print(headline)

You can also take the first row of the first table:

from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
table = soup.select_one('table')
headline = table.select_one('tr:nth-of-type(1)').text.replace('\n', ' ').replace('\xa0', ' ').strip()
print(headline)
from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
# hacky way of finding and parsing the stock data
mylist = soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
print(mylist[:2])

=============================

import pandas as pd

dict1 = {'SYMBOL': ['ACC','ADANIENT','ADANIPORTS','ADANIPOWER','AJANTPHARM','ALBK','AMARAJABAT','AMBUJACEM','APOLLOHOSP','APOLLOTYRE','ARVIND','ASHOKLEY','ASIANPAINT','AUROPHARMA','AXISBANK','BAJAJ-AUTO','BAJAJFINSV','BAJFINANCE','BALKRISIND','BANKBARODA','BANKINDIA','BANKNIFTY','BATAINDIA','BEL','BEML','BERGEPAINT','BHARATFIN','BHARATFORG','BHARTIARTL','BHEL','BIOCON','BOSCHLTD','BPCL','BRITANNIA','BSOFT','CADILAHC','CANBK','CANFINHOME','CASTROLIND','CEATLTD','CENTURYTEX','CESC','CGPOWER','CHENNPETRO','CHOLAFIN','CIPLA','COALINDIA','COLPAL','CONCOR','CUMMINSIND','DABUR','DCBBANK','DHFL','DISHTV','DIVISLAB','DLF','DRREDDY','EICHERMOT','ENGINERSIN','EQUITAS','ESCORTS','EXIDEIND','FEDERALBNK','GAIL','GLENMARK','GMRINFRA','GODFRYPHLP','GODREJCP','GODREJIND','GRASIM','GSFC','HAVELLS','HCLTECH','HDFC','HDFCBANK','HEROMOTOCO','HEXAWARE','HINDALCO','HINDPETRO','HINDUNILVR','HINDZINC','IBULHSGFIN','ICICIBANK','ICICIPRULI','IDBI','IDEA','IDFC','IDFCFIRSTB','IFCI','IGL','INDIACEM','INDIANB','INDIGO','INDUSINDBK','INFIBEAM','INFRATEL','INFY','IOC','IRB','ITC','JETAIRWAYS','JINDALSTEL','JISLJALEQS','JSWSTEEL','JUBLFOOD','JUSTDIAL','KAJARIACER','KOTAKBANK','KSCL','KTKBANK','L&TFH','LICHSGFIN','LT','LUPIN','M&M','M&MFIN','MANAPPURAM','MARICO','MARUTI','MCDOWELL-N','MCX','MFSL','MGL','MINDTREE','MOTHERSUMI','MRF','MRPL','MUTHOOTFIN','NATIONALUM','NBCC','NCC','NESTLEIND','NHPC','NIFTY','NIFTYIT','NIITTECH','NMDC','NTPC','OFSS','OIL','ONGC','ORIENTBANK','PAGEIND','PCJEWELLER','PEL','PETRONET','PFC','PIDILITIND','PNB','POWERGRID','PVR','RAMCOCEM','RAYMOND','RBLBANK','RECLTD','RELCAPITAL','RELIANCE','RELINFRA','REPCOHOME','RPOWER','SAIL','SBIN','SHREECEM','SIEMENS','SOUTHBANK','SRF','SRTRANSFIN','STAR','SUNPHARMA','SUNTV','SUZLON','SYNDIBANK','TATACHEM','TATACOMM','TATAELXSI','TATAGLOBAL','TATAMOTORS','TATAMTRDVR','TATAPOWER','TATASTEEL','TCS','TECHM','TITAN','TORNTPHARM','TORNTPOWER','TV18BRDCST','TVSMOTOR','UBL','UJJIVAN','ULTRACEMCO','UNIONBANK','UPL','VEDL','VGUARD','VOLTAS','WIPRO','WOCKPHARMA','YESBANK','ZEEL'],
         'LOT_SIZE': [400,4000,2500,20000,500,13000,700,2500,500,3000,2000,4000,600,1000,1200,250,125,250,800,4000,6000,20,550,6000,700,2200,500,1200,1851,7500,900,30,1800,200,2250,1600,2000,1800,3400,400,600,550,12000,1800,500,1000,2200,700,1563,700,1250,4500,1500,8000,400,2600,250,25,4100,4000,1100,2000,7000,2667,1000,45000,700,600,1500,750,4700,1000,700,500,250,200,1500,3500,2100,300,3200,500,1375,1500,10000,19868,13200,12000,35000,2750,4500,2000,600,300,4000,2000,1200,3500,3200,2400,2200,2250,9000,1500,500,1400,1300,400,1500,4700,4500,1100,375,700,1000,1250,6000,2600,75,1250,700,1200,600,600,2850,10,7000,1500,8000,8000,8000,50,27000,75,50,750,6000,4800,150,3399,3750,7000,25,6500,302,3000,6200,500,7000,4000,400,800,800,1200,6000,1500,500,1300,1100,16000,12000,3000,50,550,33141,250,600,1100,1100,1000,76000,15000,750,1000,400,2250,2000,3800,9000,1061,250,1200,750,500,3000,13000,1000,700,1600,200,7000,600,2300,3000,1000,3200,900,1750,1300]}
df1 = pd.DataFrame(dict1)
dict2 = {'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI', 'BHEL', 'LT'],
         'LTP': ['55', '66', '77', '88', '99'],
         'PRICE': ['0.25', '0.36', '0.12', '0.28', '0.85']}
df2 = pd.DataFrame(dict2)
print(df1, '\n\n')
print(df2, '\n\n')
df2['LOT_SIZE'] = df2[['SYMBOL']].merge(df1, how='left').LOT_SIZE
print(df2)
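
An equivalent, arguably clearer way to pull the lot sizes across is Series.map over an indexed lookup, which makes the join key explicit; a minimal sketch with toy values (not the real lot sizes):

import pandas as pd

df1 = pd.DataFrame({'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI'], 'LOT_SIZE': [1200, 1500, 8000]})
df2 = pd.DataFrame({'SYMBOL': ['INFY', 'IDBI'], 'LTP': ['55', '77']})
# map through a Series indexed by SYMBOL - no reliance on row order
df2['LOT_SIZE'] = df2['SYMBOL'].map(df1.set_index('SYMBOL')['LOT_SIZE'])
print(df2)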
Get a <span> value using Python web scraping
I am trying to get a product price using BeautifulSoup in Python, but I keep getting errors no matter what I try. I want to get the 19,90 value. I have already written code to get all the product names, and now I need their prices.

import requests
from bs4 import BeautifulSoup

url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
price = soup.find('span', itemprop_='price')
print(price)
Less ideal is parsing out the JSON containing the prices:

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
scripts = [script.text for script in soup.select('script') if 'var freedom = freedom ||' in script.text]
pricesJson = scripts[0].split('"items":')[1].split(']')[0] + ']'
prices = [item['price'] for item in json.loads(pricesJson)]
names = [name.text for name in soup.select('#item-list [itemprop=name]')]
results = list(zip(names, prices))
df = pd.DataFrame(results)
print(df)
span[itemprop='price'] is generated by JavaScript. The original value is stored in div[data-final-price] as a value like 1990, which you can format to 19,90 with a regex:

import re
...
soup = BeautifulSoup(page.text, 'html.parser')
prices = soup.select('div[data-final-price]')
for price in prices:
    price = re.sub(r'(\d\d$)', r',\1', price['data-final-price'])
    print(price)

Results:

19,90
134,89
29,90
119,90
104,90
59,90
....
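
If you'd rather not use a regex, plain integer arithmetic does the same 1990 -> 19,90 conversion; a sketch assuming the attribute always holds a whole number of centavos:

raw = '1990'  # e.g. price['data-final-price']
reais, cents = divmod(int(raw), 100)
print(f'{reais},{cents:02d}')  # 19,90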
BeautifulSoup scraping table from website with requests for pandas
I am trying to download the data on this website https://coinmunity.co/ in order to manipulate it later in Python or pandas. I first tried to load it directly into pandas via requests, using this code, but it did not work:

res = requests.get("https://coinmunity.co/")
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
dfm = pd.read_html(str(table), header=0)
dfm = dfm[0].dropna(axis=0, thresh=4)
dfm.head()

In most of the things I tried, I could only get to the info in the headers, which seems to be the only table this code sees on the page. Since that did not work, I tried the same scraping with requests and BeautifulSoup, but it did not work either. This is my code:

import requests
from bs4 import BeautifulSoup

res = requests.get("https://coinmunity.co/")
soup = BeautifulSoup(res.content, 'lxml')
#table = soup.find_all('table')[0]
#table = soup.find_all('div', {'class':'inner-container'})
#table = soup.find_all('tbody', {'class':'_ngcontent-c0'})
#table = soup.find_all('table')[0].findAll('tr')
#table = soup.find_all('table')[0].find('tbody')#.find_all('tbody _ngcontent-c3=""')
table = soup.find_all('p', {'class':'stats change positiveSubscribers'})

You can see in the commented lines everything I have tried, but nothing worked. Is there any way to download that table for use in pandas/Python, in the tidiest, easiest and quickest possible way? Thank you.
Since the content is loaded dynamically after the initial request is made, you won't be able to scrape this data with requests alone. Here's what I would do instead:

from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
driver.implicitly_wait(10)
driver.get("https://coinmunity.co/")
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
results = []
for row in soup.find_all('tr')[2:]:
    data = row.find_all('td')
    name = data[1].find('a').text
    value = data[2].find('p').text
    # get the rest of the data you need about each coin here,
    # then add it to the dictionary that you append to results
    results.append({'name': name, 'value': value})
df = pd.DataFrame(results)
df.head()

   name   value
0  NULS  14,005
1   VEN  84,486
2   EDO  20,052
3  CLUB   1,996
4   HSR   8,433

You will need to make sure that geckodriver is installed and on your PATH. I only scraped the name and value of each coin, but getting the rest of the information should be easy.
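
If you'd rather not have a browser window pop up, Firefox can also run headless; a sketch of the driver setup (option handling varies slightly across Selenium versions, so treat the exact keyword as an assumption to check against your install):

from selenium import webdriver

options = webdriver.FirefoxOptions()
options.add_argument('--headless')  # run without a visible browser window
driver = webdriver.Firefox(options=options)
driver.get("https://coinmunity.co/")
html = driver.page_source.encode('utf-8')
driver.quit()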
Unable to scrape name from Google Finance
I want to scrape the name, URL and description of companies as listed on Google Finance. So far I am successful in getting the description and URL, but unable to fetch the name. In the source code of myUrl, the name is 024 Pharma Inc. When I inspect the div, the class is named 'appbar-snippet-primary', but the code still doesn't find it. I am new to web scraping, so maybe I am missing something. Please guide me in this regard.

from bs4 import BeautifulSoup
import urllib
import csv

myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
r = urllib.urlopen(myUrl).read()
soup = BeautifulSoup(r, 'html.parser')
name_box = soup.find('div', class_='appbar-snippet-primary')  # !! This div is not found
#name = name_box.text
#print name
description = soup.find('div', class_='companySummary')
desc = description.text.strip()
#print desc
website = soup.find('div', class_='item')
site = website.text
#print site
from bs4 import BeautifulSoup
import requests

myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
r = requests.get(myUrl).content
soup = BeautifulSoup(r, 'html.parser')
name = soup.find('title').text.split(':')[0]  # take the company name from the <title> tag instead
#print name
description = soup.find('div', class_='companySummary')
desc = description.text.strip()
#print desc
website = soup.find('div', class_='item')
site = website.text
Write soup.find_all() instead of soup.find(). find() returns only the first matching element (or None if there is no match), while find_all() returns a list of every match.
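
A minimal illustration of the difference, on inline HTML:

from bs4 import BeautifulSoup

html = '<div class="item">site-a.com</div><div class="item">site-b.com</div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('div', class_='item').text)                   # site-a.com (first match only)
print([d.text for d in soup.find_all('div', class_='item')])  # ['site-a.com', 'site-b.com']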