Python BeautifulSoup Access Div container

I am trying to use BeautifulSoup to grab the container on the product detail page below that holds the brand, product name, price, etc.
According to Chrome's site inspection it is a "div" container of the class "product-detail__info".
Unfortunately my code doesn't work...
I would appreciate it if someone could give me a tip :)
Thanks in advance
Link: https://www.nemlig.com/opvasketabs-all-in-one-5039333
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = "https://www.nemlig.com/opvasketabs-all-in-one-5039333"
#Opening connection and grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
#Closing connection
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs product detail container
container = page_soup.find_all("div", {"class": "product-detail__info"})
print(container)

The data that you are looking for is part of the page source (inside a script tag); the visible product markup is most likely rendered client-side, which is why your find_all call comes back empty.
Here is the code that will return it to you:
import requests
from bs4 import BeautifulSoup as soup
import json
r = requests.get('https://www.nemlig.com/opvasketabs-all-in-one-5039333')
if r.status_code == 200:
    page_soup = soup(r.text, "html.parser")
    scripts = page_soup.find_all("script")
    # scripts[6] holds the structured data; [:-1] drops the trailing character
    data = json.loads(scripts[6].next.strip()[:-1])
    print(data)
Output:
[{'@context': 'http://schema.org/', '@type': 'Organization', 'url': 'https://www.nemlig.com/', 'logo': 'https://www.nemlig.com/https://live.nemligstatic.com/s/b1.0.7272.30289/scom/dist/images/logos/nemlig-web-logo_tagline_rgb.svg', 'contactPoint': [{'@type': 'ContactPoint', 'telephone': '+45 70 33 72 33', 'contactType': 'customer service'}], 'sameAs': ['https://www.facebook.com/nemligcom/', 'https://www.instagram.com/nemligcom/', 'https://www.linkedin.com/company/nemlig-com']}, {'@context': 'http://schema.org/', '@type': 'Product', 'name': 'Opvasketabs all in one', 'brand': 'Ecover', 'image': 'https://live.nemligstatic.com/scommerce/images/opvasketabs-all-in-one.jpg?i=ZowWdq-y/5039333', 'description': '25 stk. / zero / Ecover', 'category': 'Maskinopvask', 'url': 'https://www.nemlig.com/opvasketabs-all-in-one-5039333', 'offers': {'@type': 'Offer', 'priceCurrency': 'DKK', 'price': '44.95'}}]
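Note that indexing scripts[6] is brittle: if the page layout changes, the index shifts. Since the output above looks like schema.org JSON-LD, a more robust sketch (assuming the product data really does live in a JSON-LD block) would select the script by its type attribute:
import json
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.nemlig.com/opvasketabs-all-in-one-5039333')
page_soup = BeautifulSoup(r.text, "html.parser")

# JSON-LD blocks are conventionally tagged type="application/ld+json"
for script in page_soup.find_all("script", type="application/ld+json"):
    if script.string:
        print(json.loads(script.string))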

Related

How to use soup.find and soup.find_all

Here is my code and the output
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")
job = soup.find("div", class_ = "relative inline-flex flex-col w-full text-sm font-normal pt-2")
company_name = job.find('a[href*="jobs"]')
print(company_name)
Output:
None
But when I use the select method, I get the desired result, yet I can't use .text on it:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")
job = soup.find("div", class_ = "relative inline-flex flex-col w-full text-sm font-normal pt-2")
company_name = job.select('a[href*="jobs"]').text
print(company_name)
Output:
AttributeError: ResultSet object has no attribute 'text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Change your selection strategy, because the main issue here is that not all company names are linked:
job.find('div', {'class': 'search-result__job-meta'}).text.strip()
or
job.select_one('.search-result__job-meta').text.strip()
Example
Also store your information in a structured way for post processing:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")

data = []
for job in soup.select('div:has(>.search-result__body)'):
    data.append({
        'job': job.h3.text,
        'company': job.select_one('.search-result__job-meta').text.strip()
    })
print(data)
Output
[{'job': 'Restaurant Manager', 'company': 'Balkaan Employments service'},
{'job': 'Executive Assistant', 'company': 'Nolla Fresh & Frozen ltd'},
{'job': 'Portfolio Manager/Instructor 1', 'company': 'Fun Science World'},
{'job': 'Microbiologist', 'company': "NEIMETH INT'L PHARMACEUTICALS PLC"},
{'job': 'Data Entry Officer', 'company': 'Nkoyo Pharmaceuticals Ltd.'},
{'job': 'Chemical Analyst', 'company': "NEIMETH INT'L PHARMACEUTICALS PLC"},
{'job': 'Senior Front-End Engineer', 'company': 'Salvo Agency'},...]
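From there, the list of dicts drops straight into pandas if you want a CSV for post-processing; a minimal sketch (the filename is just an example):
import pandas as pd

df = pd.DataFrame(data)          # one row per job posting
df.to_csv('jobs.csv', index=False)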
The problems with your search strategy have been covered by comments and answers posted earlier. I am offering a solution which uses the regex library (re) together with find_all():
import requests
from bs4 import BeautifulSoup
import re

res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")

# href=re.compile(...) keeps only anchors whose href matches the pattern
company_name = soup.find_all("a", href=re.compile(r"/jobs\?"), rel="nofollow")
for tag in company_name:
    print(tag.text)
Output:
GRATIAS DEI NIGERIA LIMITED
Balkaan Employments service
Fun Science World
NEIMETH INT'L PHARMACEUTICALS PLC
Nkoyo Pharmaceuticals Ltd.
...
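For completeness, the error in the question comes from mixing the two APIs: find() does not understand CSS selectors (it silently treats the selector string as a tag name and returns None), while select() always returns a list. A minimal sketch of the difference (the class string is copied from the question and may change):
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")

job = soup.find("div", class_="relative inline-flex flex-col w-full text-sm font-normal pt-2")
if job is not None:
    links = job.select('a[href*="jobs"]')      # ResultSet: index into it or loop over it
    if links:
        print(links[0].text)
    link = job.select_one('a[href*="jobs"]')   # a single Tag or None, so .text works
    if link is not None:
        print(link.text)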

How to extract title inside span h5 a href link using BeautifulSoup

I'm trying to extract the title of a link using BeautifulSoup. The code that I'm working with is as follows:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
frame = []
for page_number in range(19):
    http = "https://www.epa.wa.gov.au/media-statements?page={}".format(page_number + 1)
    print('Downloading page %s...' % http)
    url = requests.get(http, headers=hdr)
    soup = BeautifulSoup(url.content, 'html.parser')
    for row in soup.select('.view-content .views-row'):
        content = row.select_one('.views-field-body').get_text(strip=True)
        title = row.text.strip(':')
        link = 'https://www.epa.wa.gov.au' + row.a['href']
        date = row.select_one('.date-display-single').get_text(strip=True)
        frame.append({
            'title': title,
            'link': link,
            'date': date,
            'content': content
        })
dfs = pd.DataFrame(frame)
dfs.to_csv('epa_scrapper.csv', index=False, encoding='utf-8-sig')
However, nothing gets displayed after I run the above code. How can I extract the value stored inside the title attribute of the anchor tag stored in link?
Also, how can I append "title", "link", "date" and "content" to a CSV file?
Thank you so much in advance.
To get the link text, you can use the selector "h5 a". For example:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
frame = []
for page_number in range(1, 20):
    http = "https://www.epa.wa.gov.au/media-statements?page={}".format(page_number)
    print('Downloading page %s...' % http)
    url = requests.get(http, headers=hdr)
    soup = BeautifulSoup(url.content, 'html.parser')
    for row in soup.select('.view-content .views-row'):
        content = row.select_one('.views-field-body').get_text(strip=True, separator='\n')
        title = row.select_one('h5 a').get_text(strip=True)
        link = 'https://www.epa.wa.gov.au' + row.a['href']
        date = row.select_one('.date-display-single').get_text(strip=True)
        frame.append({
            'title': title,
            'link': link,
            'date': date,
            'content': content
        })
dfs = pd.DataFrame(frame)
dfs.to_csv('epa_scrapper.csv', index=False, encoding='utf-8-sig')
This creates epa_scrapper.csv with title, link, date and content columns.
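If you are after the anchor's title attribute specifically rather than the link text (assuming the tag carries one), BeautifulSoup tags support dictionary-style attribute access; inside the row loop above:
a_tag = row.select_one('h5 a')
title_attr = a_tag.get('title') if a_tag else None  # .get() returns None if the attribute is absent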

python stock price using BeautifulSoup

I am trying to get the price of the stock using the code below, but it returns null for the current price. Please let me know where I am making an error.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://finance.yahoo.com/quote/MMM/key-statistics?p=MMM'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "lxml")

# I tried this option 1
currentPrice = page_soup.find('div', attrs={"span": "Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)"})
print(currentPrice)

# I tried this option 2
for currentPrice in page_soup.find("div", {"class": "D(ib) Mend(20px)"}):
    print(page_soup.span)
You might want to have a look at yfinance
https://pypi.org/project/yfinance/
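A minimal sketch (pip install yfinance); the fields in .info change over time, so .history() is the more stable route:
import yfinance as yf

ticker = yf.Ticker("MMM")
hist = ticker.history(period="1d")  # one day of price data as a DataFrame
print(hist["Close"].iloc[-1])       # most recent closing price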

Get value from web link

I have a URL from which I want to extract the line "Underlying Stock: NCC 96.70 As on Jun 06, 2019 10:12:20 IST", pulling the symbol ("NCC") and the underlying price ("96.70") into a list.
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
You can make a request to the site and then parse the result with Beautiful Soup.
Try this:
from bs4 import BeautifulSoup
import requests
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text)
# hacky way of finding and parsing the stock data
soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
This prints out:
['NCC', '96.9']
PS: If you get a warning about lxml, that is because lxml becomes the default parser once it is installed. In that case make the parser explicit: soup = BeautifulSoup(res.text, features="lxml"). You need lxml installed in your environment, e.g. with pip install lxml.
Another version, less hacky:
import requests
from bs4 import BeautifulSoup

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
page_html = requests.get(url).text
page_soup = BeautifulSoup(page_html, "html.parser")
print(page_soup.find("b").next.split(' '))
A succinct way is to select the first right-aligned table cell (td[align=right]), which you can actually simplify to just the attribute selector, [align=right]:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
headline = soup.select_one('[align=right]').text.strip().replace('\xa0\n',' ')
print(headline)
You can also take the first row of the first table:
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
table = soup.select_one('table')
headline = table.select_one('tr:nth-of-type(1)').text.replace('\n', ' ').replace('\xa0', ' ').strip()
print(headline)
from bs4 import BeautifulSoup
import requests
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
# hacky way of finding and parsing the stock data
mylist = soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
print(mylist[:2])
=============================
import pandas as pd
dict1 = {'SYMBOL': ['ACC','ADANIENT','ADANIPORTS','ADANIPOWER','AJANTPHARM','ALBK','AMARAJABAT','AMBUJACEM','APOLLOHOSP','APOLLOTYRE','ARVIND','ASHOKLEY','ASIANPAINT','AUROPHARMA','AXISBANK','BAJAJ-AUTO','BAJAJFINSV','BAJFINANCE','BALKRISIND','BANKBARODA','BANKINDIA','BANKNIFTY','BATAINDIA','BEL','BEML','BERGEPAINT','BHARATFIN','BHARATFORG','BHARTIARTL','BHEL','BIOCON','BOSCHLTD','BPCL','BRITANNIA','BSOFT','CADILAHC','CANBK','CANFINHOME','CASTROLIND','CEATLTD','CENTURYTEX','CESC','CGPOWER','CHENNPETRO','CHOLAFIN','CIPLA','COALINDIA','COLPAL','CONCOR','CUMMINSIND','DABUR','DCBBANK','DHFL','DISHTV','DIVISLAB','DLF','DRREDDY','EICHERMOT','ENGINERSIN','EQUITAS','ESCORTS','EXIDEIND','FEDERALBNK','GAIL','GLENMARK','GMRINFRA','GODFRYPHLP','GODREJCP','GODREJIND','GRASIM','GSFC','HAVELLS','HCLTECH','HDFC','HDFCBANK','HEROMOTOCO','HEXAWARE','HINDALCO','HINDPETRO','HINDUNILVR','HINDZINC','IBULHSGFIN','ICICIBANK','ICICIPRULI','IDBI','IDEA','IDFC','IDFCFIRSTB','IFCI','IGL','INDIACEM','INDIANB','INDIGO','INDUSINDBK','INFIBEAM','INFRATEL','INFY','IOC','IRB','ITC','JETAIRWAYS','JINDALSTEL','JISLJALEQS','JSWSTEEL','JUBLFOOD','JUSTDIAL','KAJARIACER','KOTAKBANK','KSCL','KTKBANK','L&TFH','LICHSGFIN','LT','LUPIN','M&M','M&MFIN','MANAPPURAM','MARICO','MARUTI','MCDOWELL-N','MCX','MFSL','MGL','MINDTREE','MOTHERSUMI','MRF','MRPL','MUTHOOTFIN','NATIONALUM','NBCC','NCC','NESTLEIND','NHPC','NIFTY','NIFTYIT','NIITTECH','NMDC','NTPC','OFSS','OIL','ONGC','ORIENTBANK','PAGEIND','PCJEWELLER','PEL','PETRONET','PFC','PIDILITIND','PNB','POWERGRID','PVR','RAMCOCEM','RAYMOND','RBLBANK','RECLTD','RELCAPITAL','RELIANCE','RELINFRA','REPCOHOME','RPOWER','SAIL','SBIN','SHREECEM','SIEMENS','SOUTHBANK','SRF','SRTRANSFIN','STAR','SUNPHARMA','SUNTV','SUZLON','SYNDIBANK','TATACHEM','TATACOMM','TATAELXSI','TATAGLOBAL','TATAMOTORS','TATAMTRDVR','TATAPOWER','TATASTEEL','TCS','TECHM','TITAN','TORNTPHARM','TORNTPOWER','TV18BRDCST','TVSMOTOR','UBL','UJJIVAN','ULTRACEMCO','UNIONBANK','UPL','VEDL','VGUARD','VOLTAS','WIPRO','WOCKPHARMA','YESBANK','ZEEL'],
'LOT_SIZE': [400,4000,2500,20000,500,13000,700,2500,500,3000,2000,4000,600,1000,1200,250,125,250,800,4000,6000,20,550,6000,700,2200,500,1200,1851,7500,900,30,1800,200,2250,1600,2000,1800,3400,400,600,550,12000,1800,500,1000,2200,700,1563,700,1250,4500,1500,8000,400,2600,250,25,4100,4000,1100,2000,7000,2667,1000,45000,700,600,1500,750,4700,1000,700,500,250,200,1500,3500,2100,300,3200,500,1375,1500,10000,19868,13200,12000,35000,2750,4500,2000,600,300,4000,2000,1200,3500,3200,2400,2200,2250,9000,1500,500,1400,1300,400,1500,4700,4500,1100,375,700,1000,1250,6000,2600,75,1250,700,1200,600,600,2850,10,7000,1500,8000,8000,8000,50,27000,75,50,750,6000,4800,150,3399,3750,7000,25,6500,302,3000,6200,500,7000,4000,400,800,800,1200,6000,1500,500,1300,1100,16000,12000,3000,50,550,33141,250,600,1100,1100,1000,76000,15000,750,1000,400,2250,2000,3800,9000,1061,250,1200,750,500,3000,13000,1000,700,1600,200,7000,600,2300,3000,1000,3200,900,1750,1300]}
df1 = pd.DataFrame(dict1)
dict2 = {'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI', 'BHEL', 'LT'],
'LTP': ['55', '66', '77', '88', '99'],
'PRICE': ['0.25', '0.36', '0.12', '0.28', '0.85']}
df2 = pd.DataFrame(dict2)
print(df1,'\n\n')
print(df2,'\n\n')
df2['LOT_SIZE']=df2[['SYMBOL']].merge(df1,how='left').LOT_SIZE
print(df2)
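The merge acts as a lookup table here: each SYMBOL in df2 is matched against df1 and its LOT_SIZE is copied across; symbols without a match would get NaN.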

how to extract value from website with python?

I'm working on a price web-scraping script to extract data such as prices, quantities and so on. I used BeautifulSoup to extract the text from an HTML tag on the target website.
However, it returned one big blob of text, and I don't know how to split it up before writing the pieces to a CSV file.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.digikey.com/products/en?keywords=568-3651-5-ND'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.body.script
filename = "products.txt"
f = open(filename, "w")
headers = "data"
f.write(headers)
data = containers.text
f.write(data + "\n")
f.close()
It returned:
var utag_data = {
page_site: 'US',
page_language: 'en',
wt_use_udo: 'True',
page_content_group: 'Part Search',
page_content_sub_group: 'Part Detail',
page_title: 'Part Detail',
page_type: 'PS',
page_sub_type: 'PD',
page_id: 'PD',
pn_sku: '1740-1017-ND',
part_id: '1154763',
part_available: '4324',
transaction_type: 'v',
transaction_quantity: '1',
supplier_id: '1740'
, part_search_filter: 'No Filter'
, part_search_term: '568-3651-5-ND'
, part_search_term_ext: '568-3651-5-ND'
, part_search_results_count: 1
, video_source: 'Part Detail'
}
I tried find and regex, but the results were not what I expected for the quantity (part_available: '4324') and the SKU (pn_sku: '1740-1017-ND'). I am new to Python and coding, so please leave any comments.
Instead of trying to use regular expressions, you are better off parsing the structure. The given script is not strictly formatted JSON, but you can use a YAML parser instead.
import yaml

# `containers` is the <script> tag grabbed in the question's code;
# safe_load avoids the Loader requirement of plain yaml.load in newer PyYAML
data = yaml.safe_load(containers.text.replace('var utag_data =', '', 1))
with open('products.csv', 'w') as f:
    f.write('part,sku\n')
    f.write('{},{}\n'.format(data['part_available'], data['pn_sku']))
Result csv file content:
part,sku
4324,1740-1017-ND
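If you later add fields that might contain commas, the csv module handles the quoting for you; a minimal sketch, reusing the parsed data dict from above:
import csv

with open('products.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['part', 'sku'])
    writer.writerow([data['part_available'], data['pn_sku']])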
