Unable to format the HTML output - python

I have python code that returns an HTML page. Within that page, there is a line "2092 Pittman Road" which is the parcel address. My code is below:
import mechanize

# Open the property search page, submit a lookup for one tax ID, and print
# the raw HTML of the page that comes back.
br = mechanize.Browser()
response = br.open("https://www.matsugov.us/myproperty")

# Use the first form on the page whose name attribute is 'frmSearch'.
for candidate in br.forms():
    if candidate.attrs.get('name') != 'frmSearch':
        continue
    br.form = candidate
    break

# Fill in the search type and the search parameter, then submit.
br.form['ddlType'] = ["taxid"]
br['txtParm'] = "218N02W27C003"
submitted = br.submit()
req = submitted.read()
print(req)
`req` gives me the output in HTML format. You can run this code as is to see the output.

Use this code, this will work for you:
from bs4 import BeautifulSoup
import mechanize

# Submit the tax-ID search and print the children of the first
# <td class="Grid_5"> cell found in the result page.
br = mechanize.Browser()
response = br.open("https://www.matsugov.us/myproperty")

# Use the first form on the page whose name attribute is 'frmSearch'.
for form in br.forms():
    if form.attrs.get('name') == 'frmSearch':
        br.form = form
        break

br.form['ddlType'] = ["taxid"]
br['txtParm'] = "218N02W27C003"
req = br.submit().read()

soup = BeautifulSoup(req, 'html.parser')
table = soup.find('td', {'class': 'Grid_5'})
if table is None:
    # find() returns None when nothing matches; iterating None would raise
    # a TypeError, so report the miss explicitly instead.
    print('No <td class="Grid_5"> cell found in the response')
else:
    for row in table:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(row)

Feed your HTML to BeautifulSoup, and then navigate or format it as you wish.

Related

How to scrape a live stock price from a website?

I am trying to extract the live price of this stock and store it in a variable.
This is my code
import re
from urllib.request import Request, urlopen

# Fetch the token page once (the original issued the same request twice and
# discarded one result) and decode it as UTF-8.
req = Request("https://poocoin.app/tokens/0xc001bbe2b87079294c63ece98bdd0a88d761434e", headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req) as page:
    # The context manager closes the HTTP response even if decoding fails.
    html = page.read().decode("utf-8")

# Text that is searched for the number (placeholder string kept as-is).
s = "THE HTML BELLOW"

# Raw string so that \d and \s reach the regex engine unchanged.
extractTheNumber = re.findall(r'(\d+?)\s', s)
if extractTheNumber:
    print(extractTheNumber[0])
else:
    # findall() returns an empty list when nothing matches; indexing it
    # with [0] would raise IndexError.
    print("No digit followed by whitespace found in s")
This is the site with the price :
https://poocoin.app/tokens/0xc001bbe2b87079294c63ece98bdd0a88d761434e
This is the html for the variable 's':
span class="text-success"
(but add the <> on each end )
I have no clue where to go from here or why it doesn't work.

How to fetch href link from a website using BeautifulSoup

I am trying to get all the article links in a given website below.
However, my code does not print anything at all although I specified the class id and the path to it.
below is my code.
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the category page. A separate name is used for the page bytes so
# that the `html` module imported from lxml is not shadowed.
page_html = urlopen("https://uynaa.wordpress.com/category/%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb/").read()
soup = BeautifulSoup(page_html, "lxml")

# Every <div> whose class attribute is exactly this string of classes.
productDivs = soup.findAll('div', attrs={'class' : 'post type-post status-publish format-standard hentry category-56456384'})
for div in productDivs:
    # The original indexed with the undefined name `a` (NameError); look up
    # the <a> tag inside the <h2> instead, skipping divs that lack either.
    heading = div.find('h2')
    link = heading.find('a') if heading is not None else None
    if link is not None and link.get('href') is not None:
        print(link['href'])
How do I fetch all the links?
The links are loaded dynamically via JavaScript from an external URL. You can use this example to print all links:
import json
import requests
from bs4 import BeautifulSoup

# Page through the site's infinite-scroll endpoint and print every entry of
# each batch's 'postflair' mapping until the server flags the last batch.
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'

page = 1
while True:
    # Build the form payload fresh for every request. The original reused
    # one name for both the payload and the JSON response, so from the
    # second request on it posted the previous response back to the server.
    payload = {'action': 'infinite_scroll', 'page': page}
    data = requests.post(api_url, data=payload).json()

    # uncomment next line to print all data:
    # print(json.dumps(data, indent=4))

    for p in data['postflair']:
        print(p)

    if data['lastbatch']:
        break

    page += 1
Prints:
https://uynaa.wordpress.com/2014/01/02/2013-in-review/
https://uynaa.wordpress.com/2013/10/07/%d0%b0%d1%84%d0%b3%d0%b0%d0%bd%d0%b8%d1%81%d1%82%d0%b0%d0%bd-%d0%b0%d0%bd%d1%85%d0%b4%d0%b0%d0%b3%d1%87-%d1%88%d0%b0%d0%bb%d1%82%d0%b3%d0%b0%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d0%b5-%d0%ba%d0%b0%d1%81%d0%bf%d0%b5%d1%80%d1%81%d0%ba%d0%b8%d0%b9-%d0%b1%d0%b8-%d0%b4%d0%b0%d1%80%d0%b0%d0%bd%d0%b3%d1%83%d0%b9%d0%bb%d0%b0%d0%bb-%d1%82%d0%be%d0%b3%d1%82%d0%be%d0%be-%d0%b3%d1%8d/
https://uynaa.wordpress.com/2013/10/07/%d1%88%d0%b0%d0%bd%d1%85%d0%b0%d0%b9-%d0%bd%d0%be%d0%b3%d0%be%d0%be%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d1%8d%d0%bd%d1%8d-%d0%b3%d0%b0%d0%b7%d0%b0%d1%80-%d0%bc%d0%b0%d0%bd%d0%b0%d0%b9%d1%85-%d0%b1%d0%b0%d0%b9%d1%81%d0%b0%d0%bd-%d1%8e%d0%bc/
https://uynaa.wordpress.com/2013/10/07/500-%d0%b6%d0%b8%d0%bb-%d0%b0%d1%80%d1%87%d0%bb%d1%83%d1%83%d0%bb%d0%b0%d0%b0%d0%b3%d2%af%d0%b9-%d0%b4%d1%8d%d0%bb%d1%85%d0%b8%d0%b9%d0%bd-%d1%86%d0%be%d1%80%d1%8b%d0%bd-%d0%b3%d0%b0%d0%bd%d1%86/
https://uynaa.wordpress.com/2013/02/01/%d1%83%d0%bb%d0%b7-%d0%bd%d1%83%d1%82%d0%b3%d0%b8%d0%b9%d0%bd-%d0%bf%d0%b8%d1%84%d0%b0%d0%b3%d0%be%d1%80/
https://uynaa.wordpress.com/2013/01/21/%d1%82%d0%b5%d0%bb%d0%b5%d0%b2%d0%b8%d0%b7%d0%b8%d0%b9%d0%bd-%d1%82%d2%af%d2%af%d1%85%d1%8d%d0%bd-%d0%b4%d1%8d%d1%85-%d1%85%d0%b0%d0%bc%d0%b3%d0%b8%d0%b9%d0%bd-%d0%b3%d0%b0%d0%b6%d0%b8%d0%b3-%d1%88/
https://uynaa.wordpress.com/2013/01/18/%d0%b0%d0%bf%d0%be%d1%84%d0%b8%d1%81-%d0%be%d0%be%d1%81-%d2%af%d2%af%d0%b4%d1%8d%d0%bd-%d3%a9%d1%80%d0%bd%d3%a9%d1%85-%d0%b6%d2%af%d0%b6%d0%b8%d0%b3/
https://uynaa.wordpress.com/2013/01/17/%d0%b0%d1%80%d0%b8%d1%83%d0%bd%d1%82%d0%bd%d1%8b-%d0%bd%d1%83%d1%82%d0%b0%d0%b3-%d0%b8%d0%b9%d0%b3-%d1%8d%d0%b7%d1%8d%d0%b3%d0%bd%d1%8d%d1%85-%d1%85%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/01/15/%d1%81%d0%b0%d1%83%d0%b4%d1%8b%d0%bd-%d1%82%d0%b0%d0%b3%d0%bd%d1%83%d1%83%d0%bb%d1%87%d0%b8%d0%b4-%d0%b0%d1%81%d0%b0%d0%b4%d1%8b%d0%b3-%d0%be%d0%bb%d0%b6%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/15/%d0%bc%d0%b0%d0%bb%d0%b8%d0%b3%d1%8d%d1%8d%d1%81-%d1%81%d0%be%d0%bc%d0%b0%d0%bb%d0%b8-%d1%85%d2%af%d1%80%d1%82%d1%8d%d0%bb/
https://uynaa.wordpress.com/2013/01/10/%d1%85%d0%be%d1%80%d0%b2%d0%be%d0%be-%d0%b5%d1%80%d1%82%d3%a9%d0%bd%d1%86-%d1%85%d0%b0%d0%bb%d0%b0%d0%b0%d1%81%d0%b0%d0%bd%d0%b4-%d0%b1%d0%b0%d0%b3%d1%82%d0%b0%d0%bd%d0%b0/
https://uynaa.wordpress.com/2013/01/10/%d1%82%d0%b0%d0%bd%d0%b3%d0%b0%d1%80%d0%b0%d0%b3-%d3%a9%d1%80%d0%b3%d3%a9%d1%85-%d1%91%d1%81%d0%bb%d0%be%d0%bb-%d1%85%d2%af%d0%bb%d1%8d%d1%8d%d0%b6-%d0%b1%d0%b0%d0%b9%d0%b3-%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/09/%d0%b1%d0%be%d0%bb%d0%bb%d0%b8%d0%b2%d1%83%d0%b4%d1%8b%d0%bd-%d0%ba%d0%b8%d0%bd%d0%be%d0%bd%d0%be%d0%be%d1%81-%d1%87-%d0%b0%d0%b9%d0%bc%d0%b0%d0%b0%d1%80/
https://uynaa.wordpress.com/2013/01/08/%d0%bf%d0%b5%d0%bd%d1%82%d0%b0%d0%b3%d0%be%d0%bd-%d0%b1%d0%be%d0%bb%d0%be%d0%bd-%d1%82%d1%82%d0%b3-%d1%8b%d0%b3-%d1%83%d0%b4%d0%b8%d1%80%d0%b4%d0%b0%d1%85-%d0%bc%d0%b0%d0%b3%d0%b0%d0%b4%d0%bb%d0%b0/
https://uynaa.wordpress.com/2013/01/07/%d0%b7%d0%b8%d0%b0%d0%b4-%d1%82%d0%b0%d0%ba%d0%b8%d0%b5%d0%b4%d0%b4%d0%b8%d0%bd/
...and so on.
EDIT: To filter the links only to specified category, you can use this script:
import json
import requests
from bs4 import BeautifulSoup

# Page through the infinite-scroll endpoint, keep only posts that carry a
# category link containing the wanted slug, and print each post URL once.
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'
category_slug = '%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb'

all_links = []   # post URLs in the order they were first seen
seen = set()     # same URLs, for constant-time duplicate checks

page = 1
while True:
    # Build the form payload fresh for every request. The original reused
    # one name for both the payload and the JSON response, so from the
    # second request on it posted the previous response back to the server.
    payload = {'action': 'infinite_scroll', 'page': page}
    data = requests.post(api_url, data=payload).json()

    # uncomment next line to print all data:
    # print(json.dumps(data, indent=4))

    soup = BeautifulSoup(data['html'], 'html.parser')
    for p in soup.select('.post'):
        in_category = any(
            category_slug in cat['href']
            for cat in p.select('[rel="category tag"]')
        )
        if not in_category:
            continue
        href = p.h2.a['href']
        if href not in seen:
            seen.add(href)
            print(href)
            all_links.append(href)

    if data['lastbatch']:
        break

    page += 1

print(len(all_links))
Prints 135 links:
...
https://uynaa.wordpress.com/2011/05/13/%e2%80%9c%d1%83%d1%85%d0%b0%d0%b0%d0%bd-%d0%bc%d1%83%d1%83%d1%82%d0%bd%d1%83%d1%83%d0%b4%d1%8b%d0%bd-%d2%af%d0%b5%e2%80%9d/
https://uynaa.wordpress.com/2011/05/04/%d2%af%d1%85%d0%bb%d0%b8%d0%b9%d0%bd-%d1%82%d0%be%d0%b3%d0%bb%d0%be%d0%be%d0%bc/
https://uynaa.wordpress.com/2011/05/04/%d0%be%d1%81%d0%b0%d0%bc%d0%b0-%d0%b1%d0%b8%d0%bd-%d0%bb%d0%b0%d0%b4%d0%b5%d0%bd%d0%b8%d0%b9%d0%b3-%d1%8f%d0%b0%d0%b6-%d0%b8%d0%bb%d1%80%d2%af%d2%af%d0%bb%d1%81%d1%8d%d0%bd-%d0%b1%d1%8d/
135
I'm not sure why your code doesn't work. I used the code below to get all the links first.
# Gather the href attribute of every <a> tag in the parsed page; anchors
# without an href contribute None, because Tag.get() returns None then.
a_tags = soup.find_all('a')
list_href = [anchor.get('href') for anchor in a_tags]
The links of the articles are in list_href[5:26].

python crawling beautifulsoup how to crawl several pages?

Please Help.
I want to get all the company names on each page; there are 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So Here is my code so far.
Can I get just the title (company name) of 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests

# Number of listing pages to visit. The original had 0 here, so the page
# loop never executed and nothing was ever downloaded for parsing.
maximum = 12
base_url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/'

# NOTE(review): `li:nth-child(13)` matches only the 13th list item of each
# page; confirm whether every list item was intended.
selector = "#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong"

for page_number in range(1, maximum + 1):
    URL = base_url + str(page_number)
    response = requests.get(URL)
    # Parse each page on its own instead of concatenating several complete
    # HTML documents into one string and parsing that.
    soup = BeautifulSoup(response.text, 'html.parser')
    find_company = soup.select(selector)
    for company in find_company:
        print(company.text)
---------Output of one page
---------page source :)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use soup.findAll to find the list of companies, each in a format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use the .contents attribute to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests

# Visit listing pages 1..maximum and accumulate, for every
# <strong class="company"> element, the first child of its <span>.
maximum = 12
company_list = []  # names gathered across all pages
page_url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'

for page_number in range(1, maximum + 1):
    response = requests.get(page_url.format(page_number))
    print(page_number)  # progress indicator
    soup = BeautifulSoup(response.text, 'html.parser')
    company_list.extend(
        entry.find('span').contents[0]
        for entry in soup.findAll('strong', attrs={'class': 'company'})
    )
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
image : code captured in jupyter notebook
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Collect the text of every <strong class="company"> element on listing
# pages 1..12, then write one name per line to company_name1.txt.
company_list = []
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n + 1)
    # The context manager closes the HTTP response once it has been parsed.
    with urlopen(url) as webpage:
        source = BeautifulSoup(webpage, 'html.parser', from_encoding='utf-8')
    companys = source.findAll('strong', {'class': 'company'})
    for company in companys:
        # Trim the ends and drop newlines, tabs and carriage returns that
        # remain inside the extracted text.
        company_list.append(company.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''))

# `with` guarantees the file is flushed and closed even if a write fails.
with open('company_name1.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(company + '\n' for company in company_list)

Scraping links from buttons on a page

I am trying to scrape the links from the "box score" button on this page. The button is supposed to look like this
http://www.espn.com/nfl/boxscore?gameId=400874795
I tried to use this code to see if I could access the buttons but I cannot.
from bs4 import BeautifulSoup
import requests

# Download the scoreboard page and print every <a> tag in the returned HTML.
url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
advanced = url
r = requests.get(advanced)
soup = BeautifulSoup(r.text, "html.parser")
for anchor in soup.find_all('a'):
    print(anchor)
As wpercy mentions in his comment, you can't do this using requests alone. As a suggestion, use Selenium together with ChromeDriver/PhantomJS to handle the JavaScript:
from selenium import webdriver
from bs4 import BeautifulSoup

# Load the scoreboard in a real browser and collect every <a> tag whose
# name attribute is '&lpos=nfl:scoreboard:boxscore'.
url = "http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2"

browser = webdriver.Chrome()
try:
    browser.get(url)
    html = browser.page_source
finally:
    # Always shut the driver down; otherwise the browser process is left
    # running after the script finishes or fails.
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')
boxList = soup.findAll('a', {'name': '&lpos=nfl:scoreboard:boxscore'})
All box score buttons' <a> tags have the attribute name="&lpos=nfl:scoreboard:boxscore", so we first use .findAll to collect them; a simple list comprehension can then extract each href attribute:
>>> links = [box['href'] for box in boxList]
>>> links
['/nfl/boxscore?gameId=400874795', '/nfl/boxscore?gameId=400874854', '/nfl/boxscore?gameId=400874753', '/nfl/boxscore?gameId=400874757', '/nfl/boxscore?gameId=400874772', '/nfl/boxscore?gameId=400874777', '/nfl/boxscore?gameId=400874767', '/nfl/boxscore?gameId=400874812', '/nfl/boxscore?gameId=400874761', '/nfl/boxscore?gameId=400874764', '/nfl/boxscore?gameId=400874781', '/nfl/boxscore?gameId=400874796', '/nfl/boxscore?gameId=400874750', '/nfl/boxscore?gameId=400873867', '/nfl/boxscore?gameId=400874775', '/nfl/boxscore?gameId=400874798']
Here is the solution I came up with; it scrapes all the links on the URL you provided. You can check it out:
from bs4 import BeautifulSoup
import urllib

# Python 2 script (urllib.urlopen): download the scoreboard page and print,
# for each <a> tag, its index, its href (None when absent) and a blank gap.
url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
html = urllib.urlopen(url).read()

# Name the parser explicitly; without it BeautifulSoup picks whichever
# parser is installed, so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

# Calling the soup object is shorthand for find_all().
tags = soup('a')
for i, tag in enumerate(tags):
    print(i)
    ans = tag.get('href', None)
    print(ans)
    print("\n")
The answer from Gopal Chitalia didn't work for me, so I decided to post the working one (for python 3.6.5)
from bs4 import BeautifulSoup
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

# Download the scoreboard page and print, for each <a> tag, its index, its
# href (None when absent) and a blank gap.
url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
with urllib.request.urlopen(url) as response:
    # Read inside the context manager so the HTTP response is closed.
    html = response.read()

# Name the parser explicitly; without it BeautifulSoup picks whichever
# parser is installed, so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

# Calling the soup object is shorthand for find_all().
tags = soup('a')
for i, tag in enumerate(tags):
    print(i)
    ans = tag.get('href', None)
    print(ans)
    print("\n")

extract text from html file python

I have written code to extract some text from an HTML file. This code extracts the requested line from the web page; now I want to extract the sequence data. Unfortunately, I am not able to extract the text; it shows an error.
# Python 2 script: fetch the quick-search result for gene Rv0470c, print the
# page text via BeautifulSoup, then scan the response for the "|Rv0470c|"
# marker and try to strip HTML with nltk.
import urllib2
from HTMLParser import HTMLParser
import nltk
from bs4 import BeautifulSoup
# Proxy information was removed
# from these two lines
# NOTE(review): proxyHandler is not defined anywhere in this snippet; its
# definition appears to be among the removed proxy lines.
proxyOpener = urllib2.build_opener(proxyHandler)
urllib2.install_opener(proxyOpener)
response = urllib2.urlopen('http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c')
################## BS Block ################################
# Parse the response object and print the document's text content.
soup = BeautifulSoup(response)
text = soup.get_text()
print text
##########################################################
# NOTE(review): the response was already handed to BeautifulSoup above, so
# there may be nothing left to read here. readline() also returns a single
# string, so the loop below iterates over characters, not lines. Presumably
# readlines() on a fresh response was intended; confirm.
html = response.readline()
for l in html:
if "|Rv0470c|" in l:
print l # code is running successfully till here
# NOTE(review): nltk.clean_html is not implemented in NLTK 3.x; confirm the
# installed NLTK version before relying on this call.
raw = nltk.clean_html(html)
print raw
How can I run this code successfully? I have already checked all the available threads and solution, but nothing is working.
i want to extract this part:
M. tuberculosis H37Rv|Rv0470c|pcaA
MSVQLTPHFGNVQAHYDLSDDFFRLFLDPTQTYSCAYFERDDMTLQEAQIAKIDLALGKLNLEPGMTLLDIGCGWGATMRRAIEKYDVNVVGLTLSENQAGHVQKMFDQMDTPRSRRVLLEGWEKFDEPVDRIVSIGAFEHFGHQRYHHFFEVTHRTLPADGKMLLHTIVRPTFKEGREKGLTLTHELVHFTKFILAEIFPGGWLPSIPTVHEYAEKVGFRVTAVQSLQLHYARTLDMWATALEANKDQAIAIQSQTVYDRYMKYLTGCAKLFRQGYTDVDQFTLEK
I was able to extract the desired text with the code below, which works without any dependencies except "urllib2"; for my case it works like a charm.
import urllib2

# Python 2 script: fetch the quick-search page for gene Rv0470c through an
# authenticated HTTP proxy and write the title line followed by the joined
# sequence text to output.txt.

# Placeholder proxy settings. The key is spelled 'password' because that is
# the key the proxy URL below looks up.
httpProxy = {'username': '------', 'password': '-------', 'host': '------', 'port': '-----'}

# Proxy URLs with credentials have the form http://user:password@host:port;
# '@' (not '#') separates the credentials from the host.
proxy_url = 'http://' + httpProxy['username'] + ':' + httpProxy['password'] + '@' + httpProxy['host'] + ':' + httpProxy['port']
proxyHandler = urllib2.ProxyHandler({'http': proxy_url})
proxyOpener = urllib2.build_opener(proxyHandler)
urllib2.install_opener(proxyOpener)

response = urllib2.urlopen('http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c')
html = response.readlines()

# `with` closes the output file even if one of the splits below fails.
with open("/home/zebrafish/Desktop/output.txt", 'w') as f:
    for line in html:
        if "|Rv0470c|" in line:
            # Keep what lies between the opening courier <small> cell and
            # the closing </small>, then break it at the <br /> tags.
            cell = line.split("</small>")[0].split("<TR><TD><small style=font-family:courier>")[1]
            parts = cell.split("<br />")
            ttl = parts[:1]             # first piece: the title line
            seq = "".join(parts[1:])    # remaining pieces joined together
            f.write("".join(ttl))
            f.write(seq)
I'm not quite sure about what exactly you are requesting as a whole, but here's my ad hoc take on your problem (similar to yours actually) which does retrieve the part of the html you request. Maybe you can get some ideas. (adjust for Python2)
import requests
from bs4 import BeautifulSoup

# Fetch the quick-search page for gene Rv0470c and print, for every table
# row mentioning "|Rv0470c|", the first text piece that contains the marker.
url = 'http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c'
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html, "lxml")

for n in soup.find_all('tr'):
    if "|Rv0470c|" in n.text:
        # str.replace returns a new string. The original discarded the
        # result inside `while '\n' in nt`, which could never terminate once
        # the row text contained a newline. One replace call is enough.
        nt = n.text.replace('\n', '\t')
        pieces = nt.split('\t')
        # The marker contains no tab or newline, so at least one piece is
        # guaranteed to hold it.
        nt = [x for x in pieces if "|Rv0470c|" in x][0].strip()
        print(nt.lstrip('>'))

Categories