extract text from html file python - python

I have written some code to extract text from an HTML file. This code extracts the requested line from the webpage; now I want to extract the sequence data. Unfortunately I am not able to extract the text — it's showing an error.
import urllib2
from HTMLParser import HTMLParser
import nltk
from bs4 import BeautifulSoup
# Proxy information were removed
# from these two lines
# NOTE(review): proxyHandler must be defined before this point, e.g.
# proxyHandler = urllib2.ProxyHandler({...}) — TODO restore the removed lines.
proxyOpener = urllib2.build_opener(proxyHandler)
urllib2.install_opener(proxyOpener)
response = urllib2.urlopen('http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c')
# BUG FIX: read the body exactly once.  BeautifulSoup(response) consumed the
# file object, so any later readline()/readlines() returned nothing.
content = response.read()
################## BS Block ################################
soup = BeautifulSoup(content)
text = soup.get_text()
print(text)
##########################################################
# BUG FIX: the original used response.readline(), which returns a single
# string; iterating over it yielded one CHARACTER at a time, so the
# multi-character substring test below could never be true.
for l in content.splitlines():
    if "|Rv0470c|" in l:
        print(l)
# NOTE(review): nltk.clean_html() was removed from modern NLTK (it now raises
# NotImplementedError); soup.get_text() above is the supported replacement.
raw = nltk.clean_html(content)
print(raw)
How can I run this code successfully? I have already checked all the available threads and solution, but nothing is working.
i want to extract this part:
M. tuberculosis H37Rv|Rv0470c|pcaA
MSVQLTPHFGNVQAHYDLSDDFFRLFLDPTQTYSCAYFERDDMTLQEAQIAKIDLALGKLNLEPGMTLLDIGCGWGATMRRAIEKYDVNVVGLTLSENQAGHVQKMFDQMDTPRSRRVLLEGWEKFDEPVDRIVSIGAFEHFGHQRYHHFFEVTHRTLPADGKMLLHTIVRPTFKEGREKGLTLTHELVHFTKFILAEIFPGGWLPSIPTVHEYAEKVGFRVTAVQSLQLHYARTLDMWATALEANKDQAIAIQSQTVYDRYMKYLTGCAKLFRQGYTDVDQFTLEK

I am able to extract the desired text with the following code, which works without any dependencies except "urllib2", and for my case it works like a charm.
import urllib2

# Fill in real credentials; the values are placeholders from the original post.
# BUG FIX: the second key was itself a placeholder ('-----'), but the code
# below reads httpProxy['password'], which raised KeyError.
httpProxy = {'username': '------', 'password': '-------', 'host': '------', 'port': '-----'}
# BUG FIX: user:password and host must be separated by '@', not '#', in a
# proxy URL of the form http://user:password@host:port.
proxyHandler = urllib2.ProxyHandler({'http': 'http://' + httpProxy['username'] + ':' + httpProxy['password'] + '@' + httpProxy['host'] + ':' + httpProxy['port']})
proxyOpener = urllib2.build_opener(proxyHandler)
urllib2.install_opener(proxyOpener)
response = urllib2.urlopen('http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c')
html = response.readlines()
# `with` guarantees the output file is closed even if parsing raises.
with open("/home/zebrafish/Desktop/output.txt", 'w') as f:
    for l in html:
        if "|Rv0470c|" in l:
            # Row of interest looks like:
            # <TR><TD><small ...>TITLE<br />SEQ<br />SEQ...</small>...
            l = l.split("</small>")[0].split("<TR><TD><small style=font-family:courier>")[1]
            l = l.split("<br />")
            ttl = l[:1]               # FASTA-style header line
            seq = "".join(l[1:])      # sequence fragments re-joined
            f.write("".join(ttl))
            f.write(seq)

I'm not quite sure about what exactly you are requesting as a whole, but here's my ad hoc take on your problem (similar to yours actually) which does retrieve the part of the html you request. Maybe you can get some ideas. (adjust for Python2)
import requests
from bs4 import BeautifulSoup

url = 'http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c'
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html, "lxml")
for n in soup.find_all('tr'):
    if "|Rv0470c|" in n.text:
        nt = n.text
        # BUG FIX: str.replace() returns a NEW string; the original
        # `while '\n' in nt: nt.replace('\n','\t')` discarded the result,
        # so the condition never changed and the loop ran forever.  A
        # single replace() already substitutes every occurrence.
        nt = nt.replace('\n', '\t')
        nt = nt.split('\t')
        # Keep only the field that carries the gene identifier.
        nt = [x for x in nt if "|Rv0470c|" in x][0].strip()
        print(nt.lstrip('>'))

Related

Using multiple for loop with Python Using Beautiful Soup

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

# Download the listing page and build a parse tree from the raw bytes.
url = (
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate"
    "/bloemfontein/free-state/10467/109825373"
)
data = requests.get(url)
soup = bs(data.content, "html.parser")
The code below is a test to get 1 item.
# Walk the nested containers one step at a time instead of one long chain.
listing = soup.find(class_="p24_regularListing")
overview = listing.find(class_="p24_propertyOverview")
row = overview.find(class_="p24_propertyOverviewRow")
property_overview = row.find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output : 'Listing Number'
The code below is what we have to get all the col-xs-6 p24_propertyOverviewKey
# BUG FIX: the original rebound the same variable names on every pass and
# only evaluated the innermost result once at the end, so a single item was
# ever produced.  Accumulate every key cell across all matching containers.
p24_propertyOverviewKey_items = []
for listing in soup.find_all(class_="p24_regularListing"):
    for overview in listing.find_all(class_="p24_propertyOverview"):
        for row in overview.find_all(class_="p24_propertyOverviewRow"):
            p24_propertyOverviewKey_items.extend(
                row.find_all(class_="col-xs-6 p24_propertyOverviewKey"))
p24_propertyOverviewKey_items
The code above only outputs 1 item. and not all
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests

resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")

# One descendant CSS selector walks the whole container chain; a list
# comprehension replaces the explicit append loop.
selector = " ".join([
    ".p24_regularListing",
    ".p24_propertyOverview",
    ".p24_propertyOverviewRow",
    ".p24_propertyOverviewKey",
])
texts = [tag.get_text() for tag in soup.select(selector)]
print(texts)

how to use python to parse a html that is in txt format?

I am trying to parse a txt, example as below link.
The txt, however, is in the form of HTML. I am trying to get "COMPANY CONFORMED NAME", which is located at the top of the file, and my function should return "Monocle Acquisition Corp".
https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt
I have tried below:
from bs4 import BeautifulSoup
import requests

# Download the raw EDGAR filing and hand the bytes to an HTML parser.
url = 'https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html")
However, "soup" does not contain "COMPANY CONFORMED NAME" at all.
Can someone point me in the right direction?
The data you are looking for is not in an HTML structure so Beautiful Soup is not the best tool. The correct and fast way of searching for this data is just using a simple Regular Expression like this:
import re
import requests

# Grab the filing as plain text; the header is not HTML markup, so a
# regular expression is the right tool here.
url = 'https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt'
text_string = requests.get(url).content.decode()
# Capture everything between the tab padding and the end of the line.
name_re = re.compile("COMPANY CONFORMED NAME:[\\t]*(.+)\n")
match = name_re.search(text_string).group(1)
print(match)
the part you look like is inside a huge tag <SEC-HEADER>
you can get the whole section by using soup.find('sec-header')
but you will need to parse the section manually, something like this works, but it's some dirty job :
(view it in replit : https://repl.it/#gui3/stackoverflow-parsing-html)
import requests
from bs4 import BeautifulSoup

url = 'https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html")
# The SGML header is one big <SEC-HEADER> element; parse its
# "KEY: value" lines by hand.
header = soup.find('sec-header').text
company_name = None
for line in header.split('\n'):
    # BUG FIX / robustness: split on the FIRST colon only (the original
    # line.split(':')[1] truncated any value that itself contains ':').
    # Also avoids rebinding the name `split`.
    key, sep, value = line.partition(':')
    if sep and key.strip() == 'COMPANY CONFORMED NAME':
        company_name = value.strip()
        break
print(company_name)
There may be some library able to parse this data better than this code

How to scrape data from interactive chart using python?

I have a next link which represent an exact graph I want to scrape: https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1
I simply can't tell whether it is an XML or SVG graph, or how to scrape the data. I think I need to use bs4 and requests, but I don't know how to do that.
Anyone could help?
You will load HTML like this:
import requests

# Download the chart page; .text yields the decoded response body.
url = "https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1"
data = requests.get(url).text
Then you will create a BeatifulSoup object with this HTML.
from bs4 import BeautifulSoup
# BUG FIX: the previous snippet stored the page body in `data`; the name
# `html` was never defined, so this line raised NameError.
soup = BeautifulSoup(data, features="html.parser")
After this, it is usually very subjective how to parse out what you want. The candidate codes may vary a lot. This is how I did it:
Using BeautifulSoup, I parsed all "rect"s and check if "onmouseover" exists in that rect.
# Each data bar is a <rect> whose onmouseover handler embeds the
# "'MM.YYYY value'" pair we want; take the first quoted substring.
yx_points = []
for rect in soup.svg.find_all("rect"):
    if not rect.has_attr("onmouseover"):
        continue
    handler = rect["onmouseover"]
    quoted = handler.split("'", 2)[1]  # text between the first pair of quotes
    print(quoted)
    yx_points.append(quoted.split())
As you can see from the image below, I scraped onmouseover= part and get those 02.2015 155,1 parts.
Here, this is how yx_points looks like now:
[['12.2009', '100,0'], ['01.2010', '101,8'], ['02.2010', '103,7'], ...]
from bs4 import BeautifulSoup
import requests
import re
#First get all the text from the url.
url = "https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1"
response = requests.get(url)
html = response.text
#Find all the tags in which the data is stored.
soup = BeautifulSoup(html, 'lxml')
texts = soup.findAll("rect")
final = []
for each in texts:
    names = each.get('onmouseover')
    # BUG FIX: rects without an onmouseover attribute yield None, and the
    # original relied on re.findall(None) raising TypeError inside a broad
    # try/except printing the error.  Test the cheap condition instead of
    # using exceptions for per-item control flow.
    if names is None:
        continue
    q = re.findall(r"'(.*?)'", names)
    if q:
        final.append(q[0])
#The details are appended to the final variable

Scrape string of text from website with beautiful soup

I would like to scrape a webpage and just return the GTM (Google Tag Manager) container ID (in the example below it would be GTM-5LS3NZ). The code shouldn't look for the exact container ID but rather the pattern, as I will use it on multiple sites.
So far I can search the head and print the entire piece of text containing GTM, but I don't know how to format the find and the regex together to just return GTM-5LS3NZ (In this example).
import urllib3
import re
from bs4 import BeautifulSoup

# Fetch the page and keep every text node in <head> that mentions "GTM".
manager = urllib3.PoolManager()
page = manager.request('GET', "https://www.observepoint.com/")
soup = BeautifulSoup(page.data, "html.parser")
gtm_pattern = re.compile(r'GTM')
GTM = soup.head.findAll(text=gtm_pattern)
print(GTM)
Note: The GTM ID can have 6 or 7 alphanumeric characters so I would expect the regex for the container ID to be something like ^GTM-[A-Z0-9] - I don't know how to specify 6 or 7 characters.
Clarification on what I am after.
If you run the code above you get the following.
["(function (w, d, s, l, i) {\n w[l] = w[l] || [];\n w[l].push({\n 'gtm.start': new Date().getTime(),\n event: 'gtm.js'\n });\n var f = d.getElementsByTagName(s)[0],\n j = d.createElement(s),\n dl = l != 'dataLayer' ? '&l=' + l : '';\n j.async = true;\n j.src =\n 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;\n f.parentNode.insertBefore(j, f);\n })(window, document, 'script', 'dataLayer', 'GTM-5LS3NZ');"]
Where all I want is GTM-5LS3NZ.
I did something similar a few days ago, and a quick rewrite gives me:
import urllib3
import re
from bs4 import BeautifulSoup

# Container IDs look like "GTM-" plus 6 or 7 alphanumeric characters.
pattern = re.compile(r'GTM-([a-zA-Z0-9]{6,7})')

pool = urllib3.PoolManager()
response = pool.request('GET', "https://www.observepoint.com/")
soup = BeautifulSoup(response.data, "html.parser")

# Locate the first text node matching the pattern, then pull out just the
# captured ID portion of it.
found = soup.head.find(text=pattern)
match = pattern.search(found) if found else None
if match:
    print(match.group(1))
This gives me GTM-5LS3NZ as output.
I have worked it out now, thanks to the help in the comments. This is what I was after:
# BUG FIX: urllib3 is used below but was never imported in this snippet,
# so it raised NameError as posted.
import urllib3
import re
from bs4 import BeautifulSoup

http = urllib3.PoolManager()
response = http.request('GET', "https://www.observepoint.com/")
soup = BeautifulSoup(response.data, "html.parser")
GTM = soup.head.findAll(text=re.compile(r'GTM'))
# re.search returns a Match object; indexing with [0] (Python 3.6+) gives
# the whole matched text, i.e. the container ID.
print(re.search("GTM-[A-Z0-9]{6,7}", str(GTM))[0])
You could also extract from appropriate comment
import requests
from bs4 import BeautifulSoup, Comment

# The GTM noscript fallback lives inside an HTML comment; re-parse that
# comment's markup and read the ID out of the iframe's src query string.
page = requests.get('https://www.observepoint.com/')
soup = BeautifulSoup(page.content, 'lxml')

def _is_comment(text):
    return isinstance(text, Comment)

for comment in soup.findAll(text=_is_comment):
    if 'iframe' not in comment:
        continue
    soup = BeautifulSoup(comment, 'lxml')
    container_id = soup.select_one('iframe')['src'].split('=')[1]
    print(container_id)
    break

How do i get a specific word phrase out of a word soup with beautiful soup?

I already sorted my code with BeautifulSoup and come out with this:
<bound method Tag.prettify of <script type="text/javascript">var LifeTimeStats = [{"Key":"Top 3","Value":"31"},{"Key":"Top 5s","Value":"36"},{"Key":"Top 3s","Value":"13"},{"Key":"Top 6s","Value":"27"},{"Key":"Top 12s","Value":"76"},{"Key":"Top 25s","Value":"58"},{"Key":"Score","Value":"99,788"},{"Key":"Matches Played","Value":"502"},{"Key":"Wins","Value":"9"},{"Key":"Win%","Value":"2%"},{"Key":"Kills","Value":"730"},{"Key":"K/d","Value":"1.48"}];</script>>
I am trying to get the specific Value "730"
from this :
{"Key":"Kills","Value":"730"}
As there are no HTML tags I can sort by. I have no idea, how to get this specific value. Do you have any idea?
Maybe there is another solution to get there...
Here is the full code:
#----WEB INPUT BASIC----
#import bs4
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
#setting my url
url = 'https://fortnitetracker.com/profile/psn/Rehgum'
#making my https page work
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Open the page once and keep the handle so the SAME connection is closed.
# (The original called urlopen(req) a second time just to .close() it,
# opening a brand-new connection and leaking the first one.)
conn = urlopen(req)
web_byte = conn.read()
webpage = web_byte.decode('utf-8')
conn.close()
#html parsing
page_soup = soup(webpage, "html.parser")
lifetime = page_soup.findAll("script",{"type":"text/javascript"})
# presumably the 4th script tag holds `var LifeTimeStats = [...]` — verify
# against the live page if the site layout changes.
stats = lifetime[3]
# BUG FIX: `stats.prettify` (no parentheses) is a bound method, so calling
# `.text` on it raised AttributeError.  Tag.text already yields the raw
# JavaScript source of the tag.
value = stats.text
#from here there is just code to put that value in a .txt file
#from here there is just code to put that value in a .txt file
This is just an idea of what you could do:
Extract the JS code into a Python variable.
Make a regex operation extracting the value of the variable.
"JSONify" such variable value.
Extract the data you need.
As an extract:
# BUG FIX: this snippet uses `re` and `json` without importing them.
import json
import re

# `a` is the JavaScript assignment scraped from the page.
a = '''var LifeTimeStats = [{"Key":"Top 3","Value":"31"},{"Key":"Top 5s","Value":"36"},{"Key":"Top 3s","Value":"13"},{"Key":"Top 6s","Value":"27"},{"Key":"Top 12s","Value":"76"},{"Key":"Top 25s","Value":"58"},{"Key":"Score","Value":"99,788"},{"Key":"Matches Played","Value":"502"},{"Key":"Wins","Value":"9"},{"Key":"Win%","Value":"2%"},{"Key":"Kills","Value":"730"},{"Key":"K/d","Value":"1.48"}];'''
# `b` is the right-hand side of the assignment (everything between '=' and ';').
b = re.findall(r'var.*?=\s*(.*?);', a)[0]
# `c` is that value parsed as JSON: a list of {"Key": ..., "Value": ...} dicts.
c = json.loads(b)
See the dummy full code I wrote.
UPDATE
After seeing the full code... This could be a solution for your problem.
I finally got it working!
The thing that produced my errors was the "def loop():" part.
Here is the final working code:
# NOTE(review): the indentation of this snippet was lost when it was pasted;
# every line from the imports down to time.sleep(30) appears to be the body
# of loop(), and the trailing loop() call makes the function re-invoke
# itself every 30 s — TODO confirm against the original post.  Unbounded
# recursion will eventually hit Python's recursion limit; a `while True:`
# loop would be the safer shape.
def loop():
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import json
import re
import time
#setting my url
url = 'https://fortnitetracker.com/profile/psn/Rehgum'
#making my https page work
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
urlopen(req).close()
#html parsing
page_soup = soup(webpage, "html.parser")
lifetime = page_soup.findAll("script",{"type":"text/javascript"})
# presumably the 4th script tag carries `var LifeTimeStats = [...]` —
# verify if the site layout changes.
stats = lifetime[3]
# Pull the JS right-hand side out of the tag and parse it as JSON.
stats_var = re.findall(r'var.*?=\s*(.*?);', stats.text)[0]
vals = json.loads(stats_var)
# Linear scan for the "Kills" record.
for val in vals:
if val['Key'] == 'Kills':
num_kills = val['Value']
break
print('Num kills = {}'.format(num_kills))
with open('lifetime_wins.txt', 'w') as fd:
fd.write(str(num_kills))
time.sleep(30)
loop()
for i in range(1,2):
loop()
# NOTE(review): range(1, 2) runs the loop above exactly once and leaves
# i == 1, so this while condition is never true — "Ende" is dead code
# as written.
while i<1:
print ("Ende")
Big "Thank you" to #kazbeel. You saved my Day! +rep

Categories