I am trying to extract the live price of this stock and storing it as a variable.
This is my code
import re
from urllib.request import Request, urlopen
req = Request("https://poocoin.app/tokens/0xc001bbe2b87079294c63ece98bdd0a88d761434e", headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page = urlopen(req)
html_bytes = page.read()
html = html_bytes.decode("utf-8")
s = "THE HTML BELLOW"
extractTheNumber = re.findall('(\d+?)\s', s)
print(extractTheNumber[0])
This is the site with the price :
https://poocoin.app/tokens/0xc001bbe2b87079294c63ece98bdd0a88d761434e
This is the html for the variable 's':
span class="text-success"
(but add the <> on each end )
I have no clue where to go from here or why it doesnt work.
Related
Hey how can I change this code to enter each page and get the info from this url I want ( the book name and the url of the book )
i wrote ( with google help ) this code but i want to get all the books from all the pages ( 50 pages )
# import web grabbing client and
# HTML parser
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import requests
# variable to store website link as string
booksURL = 'http://books.toscrape.com/'
# grab website and store in variable urlClient
urlClient = uReq(booksURL)
# read and close HTML
page_html = urlClient.read()
urlClient.close()
# call BeautifulSoup for parsing
page_soup = soup(page_html, "html.parser")
# grabs all the products under list tag
bookshelf = page_soup.findAll(
"li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})
for books in bookshelf:
# collect title of all books
book_title = books.h3.a["title"]
book_url = books.find("a")["href"]
#books_url = books.h3.a["url"]
print(book_title + "-" +booksURL+book_url)
i tried to add this code but i dont know how to add it to my
for i in range(51): # Number of pages plus one
url = "https://books.toscrape.com/catalogue/page-{}.html".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content)
This might work. I have removed uReq because I prefer using requests ;)
# import web grabbing client and
# HTML parser
from bs4 import BeautifulSoup as soup
import requests
for i in range(1, 51): # Number of pages plus one
url = "https://books.toscrape.com/catalogue/page-{}.html".format(i)
response = requests.get(url)
# call BeautifulSoup for parsing
page_soup = soup(response.content, "html.parser")
# grabs all the products under list tag
bookshelf = page_soup.findAll(
"li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})
for books in bookshelf:
# collect title of all books
book_title = books.h3.a["title"]
book_url = books.find("a")["href"]
print(book_title + " - " + book_url)
i try to scrape a website. But i failed to extract the description of each item. Here is my code:
from bs4 import BeautifulSoup
import requests
url = "http://engine.ddtc.co.id/putusan-pengadilan-pajak"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
puts =soup.find_all("div",{"class":"p3-search-item"})
for put in puts:
title = put.find("div", {"class":"p3-title"}).text
cat = put.find("div", {"class":"p3-category"}).text
date = put.find("div", {"class":"search-result-item-meta"}).text
link = put.find("a").get("href")
put_response = requests.get(link)
put_data = put_response.text
put_soup = BeautifulSoup(put_data, "html.parser")
put_description = put_soup.find("div",{"id": "modal-contents-pp"}).text
print("Judul Putusan:", title, "\nKategori:", cat, "\nTanggal:", date, "\nLink:", link, "\nDescription:", put_description)
So i failed to extract the description.
The description only show blank and few words. The full description can be shown if we click each item's link.
Really appreciate any help.
I think you need to change the put_description field:
from bs4 import BeautifulSoup
import requests
url = "http://engine.ddtc.co.id/putusan-pengadilan-pajak"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
puts =soup.find_all("div",{"class":"p3-search-item"})
for put in puts:
title = put.find("div", {"class":"p3-title"}).text
cat = put.find("div", {"class":"p3-category"}).text
date = put.find("div", {"class":"search-result-item-meta"}).text
link = put.find("a").get("href")
put_response = requests.get(link)
put_data = put_response.text
put_soup = BeautifulSoup(put_data, "html.parser")
put_description = put_soup.find("div",{"class": "p3-desc"}).text
print("Judul Putusan:", title, "\nKategori:", cat, "\nTanggal:", date, "\nLink:", link, "\nDescription:", put_description)
Please Help.
I want to get all the company names of each pages and they have 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So Here is my code so far.
Can I get just the title (company name) of 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
response = requests.get(URL)
whole_source = whole_source + response.text
soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
print(company.text)
---------Output of one page
---------page source :)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use the soup.findAll to find the list of company in the format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use .contents function to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests
maximum = 12
company_list = [] # List for result storing
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
response = requests.get(URL)
print(page_number)
whole_source = response.text
soup = BeautifulSoup(whole_source, 'html.parser')
for entry in soup.findAll('strong', attrs={'class': 'company'}): # Finding all company names in the page
company_list.append(entry.find('span').contents[0]) # Extracting name from the result
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
image : code captured in jupyter notebook
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup
company_list=[]
for n in range(12):
url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
webpage = urlopen(url)
source = BeautifulSoup(webpage,'html.parser',from_encoding='utf-8')
companys = source.findAll('strong',{'class':'company'})
for company in companys:
company_list.append(company.get_text().strip().replace('\n','').replace('\t','').replace('\r',''))
file = open('company_name1.txt','w',encoding='utf-8')
for company in company_list:
file.write(company+'\n')
file.close()
I already sorted my code with BeautifulSoup and come out with this:
<bound method Tag.prettify of <script type="text/javascript">var LifeTimeStats = [{"Key":"Top 3","Value":"31"},{"Key":"Top 5s","Value":"36"},{"Key":"Top 3s","Value":"13"},{"Key":"Top 6s","Value":"27"},{"Key":"Top 12s","Value":"76"},{"Key":"Top 25s","Value":"58"},{"Key":"Score","Value":"99,788"},{"Key":"Matches Played","Value":"502"},{"Key":"Wins","Value":"9"},{"Key":"Win%","Value":"2%"},{"Key":"Kills","Value":"730"},{"Key":"K/d","Value":"1.48"}];</script>>
I am trying to get the specific Value "730"
from this :
{"Key":"Kills","Value":"730"}
As there are no HTML tags I can sort by. I have no idea, how to get this specific value. Do you have any idea?
Maybe there is another solution to get there...
Here is the full code:
#----WEB INPUT BASIC----
#import bs4
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
#setting my url
url = 'https://fortnitetracker.com/profile/psn/Rehgum'
#making my https page work
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
urlopen(req).close()
#html parsing
page_soup = soup(webpage, "html.parser")
lifetime = page_soup.findAll("script",{"type":"text/javascript"})
stats = lifetime[3]
specific = stats.prettify
value = specific.text
#from here there is just code to put that value in a .txt file
This is just an idea of what you could do:
Extract the JS code into a Python variable.
Make a regex operation extracting the value of the variable.
"JSONify" such variable value.
Extract the data you need.
As an extract:
a = '''var LifeTimeStats = [{"Key":"Top 3","Value":"31"},{"Key":"Top 5s","Value":"36"},{"Key":"Top 3s","Value":"13"},{"Key":"Top 6s","Value":"27"},{"Key":"Top 12s","Value":"76"},{"Key":"Top 25s","Value":"58"},{"Key":"Score","Value":"99,788"},{"Key":"Matches Played","Value":"502"},{"Key":"Wins","Value":"9"},{"Key":"Win%","Value":"2%"},{"Key":"Kills","Value":"730"},{"Key":"K/d","Value":"1.48"}];'''
b = re.findall(r'var.*?=\s*(.*?);', a)[0]
c = json.loads(b)
See the dummy full code I wrote.
UPDATE
After seeing the full code... This could be a solution for your problem.
I finally got it working!
The thing that produced my errors was the "def loop():" part.
Here is the final working code:
def loop():
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import json
import re
import time
#setting my url
url = 'https://fortnitetracker.com/profile/psn/Rehgum'
#making my https page work
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
urlopen(req).close()
#html parsing
page_soup = soup(webpage, "html.parser")
lifetime = page_soup.findAll("script",{"type":"text/javascript"})
stats = lifetime[3]
stats_var = re.findall(r'var.*?=\s*(.*?);', stats.text)[0]
vals = json.loads(stats_var)
for val in vals:
if val['Key'] == 'Kills':
num_kills = val['Value']
break
print('Num kills = {}'.format(num_kills))
with open('lifetime_wins.txt', 'w') as fd:
fd.write(str(num_kills))
time.sleep(30)
loop()
for i in range(1,2):
loop()
while i<1:
print ("Ende")
Big "Thank you" to #kazbeel. You saved my Day! +rep
I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page, it shows under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for a HTML link, that I could use for the next iteration of the scrape, but no luck.
Further inspection, by looking at network traffic, shows that the browser send a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages of one dog, either by iterating over all pages, or by increasing the page length to show 100+ lines on page 1. The underlying database is .aspx.
I'm using Python 3.5 and BeautifulSoup.
current code:
import requests
from bs4 import BeautifulSoup
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
with requests.session() as s:
s.headers['user-agent'] = 'Mozilla/5.0'
r = s.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
data = { tag['name']: tag['value']
for tag in soup.select('input[name^=ctl00]') if tag.get('value')
}
state = { tag['name']: tag['value']
for tag in soup.select('input[name^=__]')
}
data.update(state)
numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])
# for page in range(last_page + 1):
for page in range(numberpages):
data['__EVENTTARGET'] = target.format(page)
#data['__VIEWSTATE'] = target.format(page)
print(10)
r = s.post(url, data=data)
soup = BeautifulSoup(r.content, 'html5lib')
tables = soup.findChildren('table')
my_table = tables[9]
rows = my_table.findChildren(['th', 'tr'])
tabel = [[]]
for i in range(len(rows)):
cells = rows[i].findChildren('td')
tabel.append([])
for j in range(len(cells)):
value = cells[j].string
tabel[i].append(value)
table = []
for i in range(len(tabel)):
if len(tabel[i]) == 16:
del tabel[i][-2:]
table.append(tabel[i])
In this case, for each page requested a POST request is issued with form url encoded parameter __EVENTTARGET & __VIEWSTATE :
__VIEWSTATE can be easily extracted from an input tag
__EVENTTARGET is different for each page and the value is passed from a javacript function for each page link so you can extract it with a regex :
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
The python script :
from bs4 import BeautifulSoup
import requests
import re
# extract data from page
def extract_data(soup):
tables = soup.find_all("div", {"class":"race-card"})[0].find_all("tbody")
item_list = [
(
t[0].text.strip(), #date
t[1].text.strip(), #dist
t[2].text.strip(), #TP
t[3].text.strip(), #StmHCP
t[4].text.strip(), #Fin
t[5].text.strip(), #By
t[6].text.strip(), #WinnerOr2nd
t[7].text.strip(), #Venue
t[8].text.strip(), #Remarks
t[9].text.strip(), #WinTime
t[10].text.strip(), #Going
t[11].text.strip(), #SP
t[12].text.strip(), #Class
t[13].text.strip() #CalcTm
)
for t in (t.find_all('td') for t in tables[1].find_all('tr'))
if t
]
print(item_list)
session = requests.Session()
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
response = session.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# get view state value
view_state = soup.find_all("input", {"id":"__VIEWSTATE"})[0]["value"]
# get all event target values
event_target = soup.find_all("div", {"class":"rgNumPart"})[0]
event_target_list = [
re.search('__doPostBack\(\'(.*)\',', t["href"]).group(1)
for t in event_target.find_all('a')
]
# extract data for the 1st page
extract_data(soup)
# extract data for each page except the first
for link in event_target_list[1:]:
print("get page {0}".format(link))
post_data = {
'__EVENTTARGET': link,
'__VIEWSTATE': view_state
}
response = session.post(url, data=post_data)
soup = BeautifulSoup(response.content, "html.parser")
extract_data(soup)