Get data from HTML page using python

Get data from HTML page using python - python

I would like to get the value 100 from the tag below using python and beautiful soup
<span style="font-size:90%"><b>100</b> <cite style="color:#cc0000"><b>-0.10</b> (0.52%)</cite></span>
The code below gives me the following output
100 -0.10 (0.52%)
How can I extract only the value 100?
Code:
from urllib.request import Request, urlopen
import bs4
import re
# Presumably a placeholder address; substitute the real page URL.
url = 'url.com'
# Send a browser-like User-Agent header with the request.
req = Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
page = urlopen(req).read()
soup = bs4.BeautifulSoup(page, 'html.parser')
# First <span> whose style attribute matches the pattern 'font-size:90%'.
data = soup.find('span',style=re.compile('font-size:90%'))
# .text joins the text of all descendants, so the <cite> content is included too.
value = data.text

You can get the first element of soup.contents:
from bs4 import BeautifulSoup as soup
# Parse the page, locate the span by its exact style value, then read the
# text of the span's first child node.
span = soup(page, 'html.parser').find('span', {'style': 'font-size:90%'})
first_child = span.contents[0]
d = first_child.text
Output:
'100'

Just find the <b> tag; it will give you 100.
# Match the span with a regex on its style attribute, then take the text of
# the first <b> element inside it.
style_pattern = re.compile('font-size:90%')
data = soup.find('span', style=style_pattern)
bold = data.find('b')
value = bold.text

Related

lxml to grab All items that share a certain xpath

I'm trying to grab all prices from a website, using the xpath. all prices have the same xpath, and only [0], or I assume the 1st item works... let me show you:
# URL and HEADERS are not defined in this excerpt.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
# Re-serialize the parsed soup and hand it to lxml so XPath can be used.
dom = etree.HTML(str(soup))
# xpath() yields a list of matching nodes; [0] picks the first match.
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!!!
I tried changing "[0].text" to 1, to print the 2nd item but it returned "out of range".
Then I was trying to think of some For loop that would print All Items, so I could create an average.
Any help would be Greatly appreciated!!!
I apologize; I have now edited the code in:
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
# HEADERS must be defined here; the asker's own request headers were left out of the post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
# Re-serialize the parsed soup and hand it to lxml so XPath can be used.
dom = etree.HTML(str(soup))
# The absolute path ends in li[3]/strong, so it addresses a single node; [0] takes the first match.
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)

You could just use css selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
# Fetch the listing page with a browser-like User-Agent header.
r = requests.get(
    "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709",
    headers={'User-Agent': 'Mozilla/5.0'},
)
soup = bs(r.text, features="lxml")

# Map each product title to its displayed price string.
prices = {}
for item in soup.select('.item-container'):
    # Remove the '.price-current-num' node first so that its text (the
    # offers info) does not end up in the price string.
    if num := item.select_one('.price-current-num'):
        num.decompose()
    title = item.select_one('.item-title').text
    # [:-1] trims the last character of the remaining price text.
    prices[title] = item.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
Prices as a list of floats:
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint
# Fetch the listing page with a browser-like User-Agent header.
r = requests.get(
    "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709",
    headers={'User-Agent': 'Mozilla/5.0'},
)
soup = bs(r.text, features="lxml")

# Collect every listed price as a float.
prices = []
for item in soup.select('.item-container'):
    # Remove the '.price-current-num' node first so that its text (the
    # offers info) does not end up in the price string.
    if num := item.select_one('.price-current-num'):
        num.decompose()
    # [:-1] trims the last character of the remaining price text.
    price_text = item.select_one('.price-current').get_text(strip=True)[:-1]
    # Raw string: '\$' in a normal literal is an invalid escape sequence.
    # The substitution strips '$' signs and thousands separators.
    prices.append(float(re.sub(r'\$|,', '', price_text)))
pprint(prices)

How can I change the code to make it such that the html tags do not appear

from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
# CSS class used below to pick out the <h1> elements.
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
# find_all returns a list of Tag objects, so printing it shows the markup too.
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
with the above code, i get the output:
[<*h1 class="celeb-name">Ayden Sng</h1*>] #asterisks added to show h1 tags
What do i need to change in my code or how can i make it such that i only get 'Ayden Sng' as my output?

Iterate over each entry of the txt list and extract its text property:
# Replace each matched Tag with its text content, e.g. ['Ayden Sng'].
txt = [tag.get_text() for tag in txt]
Repl.it

from bs4 import BeautifulSoup
import requests
# Page to scrape and the CSS class of the heading to look for.
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'

page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

# Collect every <h1> with that class and print the text of the first match.
txt = soup.find_all('h1', class_=artiste_name)
first_heading = txt[0]
print(first_heading.get_text())
If there is more than one result, you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
# CSS class of the <h1> elements to collect.
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class': artiste_name})
# Print the text of every matching heading, one per line.
for heading in txt:
    print(heading.text)

How do I extract the underlined value in red below and save it as a list?

How do I extract the underlined value in red below and save it as a list?
I want to extract the memCode value from the href of the <a> tag inside each <p> tag using BeautifulSoup.
However, I don't know how to extract it at all.
Please help me.
My code
import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
# A Request object is built here, but the plain url is what gets opened below.
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
# NOTE: this name shadows the built-in list type; nothing is appended to it yet.
list = []
# Print every <a> tag found inside the member-list container.
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    print(href)

try this, using css selector
import requests
from bs4 import BeautifulSoup
resp = requests.get('https://www.gjcouncil.go.kr/kr/member/name.do')
soup = BeautifulSoup(resp.text, "html.parser")
# Direct-child chain: the div with id 'member_list' > ul > li > a.
for a in soup.select("div[id='member_list'] > ul > li > a"):
    # Print the third '/'-separated piece of each link's href.
    print(a['href'].split("/")[2])
08070
00716
08040
....
....

You can split on the "=" and take the -1 index. I also changed the class used in the selector.
import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
# Open the prepared Request (it was previously built but never used) and
# let the with-block close the response once the body has been read.
with urllib.request.urlopen(req) as response:
    sourcecode = response.read()
soup = BeautifulSoup(sourcecode, "html.parser")
# Keep whatever follows the last '=' in each '.btn-home' link's href.
ids = [link['href'].split('=')[-1] for link in soup.select('.btn-home')]

import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
# Open the prepared Request and close the response once the body is read.
with urllib.request.urlopen(req) as response:
    sourcecode = response.read()
soup = BeautifulSoup(sourcecode, "html.parser")

container = soup.find("div", class_="memList memList-col-3")
# Skip the '#LINK' placeholder anchors; keep the last 7 characters of every
# other href.
href_list = [
    a['href'][-7:]
    for a in container.find_all("a")
    if a['href'] != '#LINK'
]
print(href_list)
['7620212', '7670126', '7670420', '7650601', '7890930', '7800407', '7660925', '7641102', '7731222', '7801011', '7570803', '7770106', '7590808', '7700831', '7580115', '7710713', '7680112', '7621125', '7711117', '7680213', '7640925', '7591214']

One of the best methods is to use a regular expression.
Check out this code :
import urllib.request
from bs4 import BeautifulSoup
import re
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
# Open the prepared Request and close the response once the body is read.
with urllib.request.urlopen(req) as response:
    sourcecode = response.read()
soup = BeautifulSoup(sourcecode, "html.parser")

# Every href found inside the member-list container.
container = soup.find("div", class_="memList memList-col-3")
list_ = [a['href'] for a in container.find_all("a")]

# Captures the word characters that follow 'memCode=' in a link.
regobj = re.compile(r'memCode=(\w+)')
# Drop the '#LINK' placeholder anchors before matching.
final = [href for href in list_ if href != '#LINK']
result = [regobj.search(href).group(1) for href in final]
print(result)

data scraping - field value - issue

I would like to obtain current information about the number of infected people from this website: https://www.gov.pl/web/koronawirus/wykaz-zarazen-koronawirusem-sars-cov-2
my code looks like:
import requests
from bs4 import BeautifulSoup
adresURL = 'https://www.gov.pl/web/koronawirus/wykaz-zarazen-koronawirusem-sars-cov-2'
res = requests.get(adresURL)
soup = BeautifulSoup(res.text, 'html.parser')
# Select every element carrying the class 'details-property-value'.
data = soup.select('.details-property-value')
# The printed markup still holds the unrendered '{{...}}' template placeholder.
print(data)
as a result I'm receiving:
[<div class="details-property-value" tabindex="0">{{selectedRecord[commonColumns[index]] || '-'}}</div>]
Any ideas how to get the values of the fields? Am I missing something?

I'm guessing you're trying to scrape the table on that page. It looks like there is some JSON baked into the HTML:
import requests
from bs4 import BeautifulSoup
import json
# Download the page and fail fast on an HTTP error status.
url = "https://www.gov.pl/web/koronawirus/wykaz-zarazen-koronawirusem-sars-cov-2"
response = requests.get(url)
response.raise_for_status()

# The data is embedded as JSON text inside the <pre id="registerData"> element.
soup = BeautifulSoup(response.content, "html.parser")
register_pre = soup.find("pre", id="registerData")
data = json.loads(register_pre.get_text())
print(data)

How do i get a specific word phrase out of a word soup with beautiful soup?

I already sorted my code with BeautifulSoup and come out with this:
<bound method Tag.prettify of <script type="text/javascript">var LifeTimeStats = [{"Key":"Top 3","Value":"31"},{"Key":"Top 5s","Value":"36"},{"Key":"Top 3s","Value":"13"},{"Key":"Top 6s","Value":"27"},{"Key":"Top 12s","Value":"76"},{"Key":"Top 25s","Value":"58"},{"Key":"Score","Value":"99,788"},{"Key":"Matches Played","Value":"502"},{"Key":"Wins","Value":"9"},{"Key":"Win%","Value":"2%"},{"Key":"Kills","Value":"730"},{"Key":"K/d","Value":"1.48"}];</script>>
I am trying to get the specific Value "730"
from this :
{"Key":"Kills","Value":"730"}
As there are no HTML tags I can sort by. I have no idea, how to get this specific value. Do you have any idea?
Maybe there is another solution to get there...
Here is the full code:
# ---- Web input: fetch the profile page and pick out one <script> element ----
#import bs4
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
# Profile page to read.
url = 'https://fortnitetracker.com/profile/psn/Rehgum'
# Request the page with a browser-like User-Agent header.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
# NOTE: this issues a second request and closes that new response; the
# response read above is not the one being closed.
urlopen(req).close()
# Parse the HTML and collect every <script type="text/javascript"> element.
page_soup = soup(webpage, "html.parser")
lifetime = page_soup.findAll("script",{"type":"text/javascript"})
# The fourth such script is the one holding the stats array.
stats = lifetime[3]
# NOTE: prettify is referenced without being called, so this is the bound
# method itself (hence the "<bound method Tag.prettify of ...>" output).
specific = stats.prettify
value = specific.text
# From here on there is only code that writes the value to a .txt file.

This is just an idea of what you could do:
Extract the JS code into a Python variable.
Make a regex operation extracting the value of the variable.
"JSONify" such variable value.
Extract the data you need.
As an extract:
# Raw JavaScript text of the <script> element shown in the question.
a = '''var LifeTimeStats = [{"Key":"Top 3","Value":"31"},{"Key":"Top 5s","Value":"36"},{"Key":"Top 3s","Value":"13"},{"Key":"Top 6s","Value":"27"},{"Key":"Top 12s","Value":"76"},{"Key":"Top 25s","Value":"58"},{"Key":"Score","Value":"99,788"},{"Key":"Matches Played","Value":"502"},{"Key":"Wins","Value":"9"},{"Key":"Win%","Value":"2%"},{"Key":"Kills","Value":"730"},{"Key":"K/d","Value":"1.48"}];'''
# Capture what lies between the first '=' after "var" and the next ';',
# which here is the array literal.
assignment = re.compile(r'var.*?=\s*(.*?);')
b = assignment.findall(a)[0]
# Decode the captured array into a list of {"Key": ..., "Value": ...} dicts.
c = json.loads(b)
See the dummy full code I wrote.
UPDATE
After seeing the full code... This could be a solution for your problem.

I finally got it working!
The thing that produced my errors was the "def loop():" part.
Here is the final working code:
def _fetch_html(url):
    """Return the UTF-8 decoded body of *url*, requested with a browser-like User-Agent."""
    from urllib.request import Request, urlopen

    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # One request only: the with-block closes this response, instead of
    # opening a second connection just to call close() on it.
    with urlopen(req) as response:
        return response.read().decode('utf-8')


def _lookup_stat(script_text, key):
    """Return the "Value" stored next to *key* in the JS array assigned in *script_text*.

    Returns None when no entry carries that key.
    """
    import json
    import re

    # Grab what lies between the first '=' after "var" and the next ';'.
    stats_var = re.findall(r'var.*?=\s*(.*?);', script_text)[0]
    for val in json.loads(stats_var):
        if val['Key'] == key:
            return val['Value']
    return None


def loop():
    """Poll the profile page every 30 seconds and save the "Kills" value.

    Each pass prints the value and overwrites lifetime_wins.txt with it.
    The function never returns. It repeats with a plain ``while`` loop
    instead of calling itself, so a long run cannot hit the recursion limit.
    """
    from bs4 import BeautifulSoup as soup
    import time

    url = 'https://fortnitetracker.com/profile/psn/Rehgum'
    while True:
        page_soup = soup(_fetch_html(url), "html.parser")
        lifetime = page_soup.find_all("script", {"type": "text/javascript"})
        # The stats array is read from the fourth text/javascript <script>.
        num_kills = _lookup_stat(lifetime[3].text, 'Kills')
        if num_kills is None:
            # Without this guard a missing entry left num_kills unbound.
            print('Kills entry not found')
        else:
            print('Num kills = {}'.format(num_kills))
            with open('lifetime_wins.txt', 'w') as fd:
                fd.write(str(num_kills))
        time.sleep(30)


if __name__ == "__main__":
    loop()
Big "Thank you" to #kazbeel. You saved my Day! +rep

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Get data from HTML page using python - python

You can get the first element of soup.contents: from bs4 import BeautifulSoup as soup d = soup(page, 'html.parser').find('span', {'style':'font-size:90%'}).contents[0].text Output: '100'

Just Find the <b> tag it will give you 100. data = soup.find('span',style=re.compile('font-size:90%')) value = data.find('b').text

Related

lxml to grab All items that share a certain xpath

How can I change the code to make it such that the html tags do not appear

How do I extract the underlined value in red below and save it as a list?

data scraping - field value - issue

How do i get a specific word phrase out of a word soup with beautiful soup?

Categories

Resources