Scrape string of text from website with Beautiful Soup - Python

I would like to scrape a webpage and just return the GTM (Google Tag Manager) container ID (in the example below it would be GTM-5LS3NZ). The code shouldn't look for the exact container ID but rather the pattern, as I will use it on multiple sites.
So far I can search the head and print the entire piece of text containing GTM, but I don't know how to format the find and the regex together to just return GTM-5LS3NZ (In this example).
import urllib3
import re
from bs4 import BeautifulSoup
http = urllib3.PoolManager()
response = http.request('GET', "https://www.observepoint.com/")
soup = BeautifulSoup(response.data,"html.parser")
GTM = soup.head.findAll(text=re.compile(r'GTM'))
print(GTM)
Note: The GTM ID can have 6 or 7 alphanumeric characters so I would expect the regex for the container ID to be something like ^GTM-[A-Z0-9] - I don't know how to specify 6 or 7 characters.
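A {6,7} quantifier expresses exactly this: six or seven repetitions of the preceding character class. A minimal check:
import re
# {6,7} matches 6 or 7 characters from the class, covering both ID lengths
print(re.search(r'GTM-[A-Z0-9]{6,7}', "'dataLayer', 'GTM-5LS3NZ');").group(0))  # GTM-5LS3NZ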
Clarification on what I am after.
If you run the code above you get the following.
["(function (w, d, s, l, i) {\n w[l] = w[l] || [];\n w[l].push({\n 'gtm.start': new Date().getTime(),\n event: 'gtm.js'\n });\n var f = d.getElementsByTagName(s)[0],\n j = d.createElement(s),\n dl = l != 'dataLayer' ? '&l=' + l : '';\n j.async = true;\n j.src =\n 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;\n f.parentNode.insertBefore(j, f);\n })(window, document, 'script', 'dataLayer', 'GTM-5LS3NZ');"]
Where all I want is GTM-5LS3NZ.

I did something similar a few days ago, and a quick rewrite gives me:
import urllib3
import re
from bs4 import BeautifulSoup
http = urllib3.PoolManager()
response = http.request('GET', "https://www.observepoint.com/")
soup = BeautifulSoup(response.data,"html.parser")
pattern = re.compile(r'GTM-([a-zA-Z0-9]{6,7})')
found = soup.head.find(text=pattern)
if found:
    match = pattern.search(found)
    if match:
        # group(0) is the full match, including the GTM- prefix;
        # group(1) would give only the part after the hyphen
        print(match.group(0))
This gives me GTM-5LS3NZ as output.

I have worked it out now, thanks to the help in the comments. This is what I was after:
import urllib3
import re
from bs4 import BeautifulSoup
http = urllib3.PoolManager()
response = http.request('GET', "https://www.observepoint.com/")
soup = BeautifulSoup(response.data,"html.parser")
GTM = soup.head.findAll(text=re.compile(r'GTM'))
print(re.search("GTM-[A-Z0-9]{6,7}",str(GTM))[0])
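Note that indexing the match object with [0] needs Python 3.6+ (re.Match only gained __getitem__ in 3.6); [0] is the full match, equivalent to .group(0). On older versions:
print(re.search("GTM-[A-Z0-9]{6,7}", str(GTM)).group(0))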

You could also extract the ID from the appropriate HTML comment. On this page the GTM noscript iframe markup (whose src ends in ?id=GTM-...) sits inside a comment, so splitting the src on '=' yields the container ID:
import requests
from bs4 import BeautifulSoup, Comment
r = requests.get('https://www.observepoint.com/')
soup = BeautifulSoup(r.content, 'lxml')
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    if 'iframe' in comment:
        soup = BeautifulSoup(comment, 'lxml')
        id = soup.select_one('iframe')['src'].split('=')[1]
        print(id)
        break
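A slightly more defensive variant of the same idea is to search each comment for the container-ID pattern itself rather than splitting the iframe src on '='; a sketch:
import re
import requests
from bs4 import BeautifulSoup, Comment

r = requests.get('https://www.observepoint.com/')
soup = BeautifulSoup(r.content, 'lxml')
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    m = re.search(r'GTM-[A-Z0-9]{6,7}', comment)
    if m:
        print(m.group(0))  # GTM-5LS3NZ
        break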


How to select elements one by one in web scraping using Python

I want only h3[0] and h6[1], for example.
<div class="span16">
<h3>Shroot, Stephanie</h3>
<h6>Chemistry</h6>
<h6>December 2021</h6>
<p>Thesis or dissertation
<h3>Shroot</h3>
I use BeautifulSoup and a for loop to get the information:
url = line.strip()
r_html = requests.get(url, headers=headers).text
r_html_sc = requests.get(url, headers=headers).status_code
soup = BeautifulSoup(r_html, "html.parser")
thesis_infos = soup.find('div', {"class": "span16"})
if thesis_infos is not None:
    thesis_infos_text = thesis_infos.text.strip()
else:
    thesis_infos_1 = " "
print(thesis_infos_text)
thesis_infos_lines = thesis_infos_text.splitlines()  # a str has no readlines(); splitlines() splits on newlines
author1_1 = thesis_infos_lines[0]
year1_1 = thesis_infos_lines[2]
Edit:
The easiest way is probably to use BeautifulSoup, like so:
soup.find_all("h3")[0]
soup.find_all("h6")[1]
Here is a short example, filtering for links on google.com:
import requests
from bs4 import BeautifulSoup
html = requests.get("https://www.google.com").text
soup = BeautifulSoup(html, "html.parser")
links = soup.findAll("a")
print(links[0])
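Applied to the HTML fragment from the question, a minimal sketch:
from bs4 import BeautifulSoup

html = """
<div class="span16">
<h3>Shroot, Stephanie</h3>
<h6>Chemistry</h6>
<h6>December 2021</h6>
<p>Thesis or dissertation
<h3>Shroot</h3>
"""
soup = BeautifulSoup(html, "html.parser")
print(soup.find_all("h3")[0].get_text())  # Shroot, Stephanie
print(soup.find_all("h6")[1].get_text())  # December 2021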
Is this what you are looking for?
import re
code = """
<div class="span16">
<h3>Shroot, Stephanie</h3>
<h6>Chemistry</h6>
<h6>December 2021</h6>
<p>Thesis or dissertation
<h3>Shroot</h3>
"""
h3_matches = re.findall(".*<h3>(.+)<\\/h3>", code)
h6_matches = re.findall(".*<h6>(.+)<\\/h6>", code)
print(h3_matches[0])
print(h6_matches[1])
output:
Shroot, Stephanie
December 2021
thesis_infos = soup.find('div',{"class":"span16"})
code = str(thesis_infos)
h3_matches = re.findall(".*<h3>(.+)<\\/h3>", code)
h6_matches = re.findall(".*<h6>(.+)<\\/h6>", code)
print(h3_matches[0])
print(h6_matches[1])
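Since thesis_infos is already a parsed Tag, the same positional indexing works on it directly, with no round trip through str() and regex; a minimal sketch:
author = thesis_infos.find_all("h3")[0].get_text()  # Shroot, Stephanie
date = thesis_infos.find_all("h6")[1].get_text()    # December 2021
print(author)
print(date)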

Using multiple for loops with Python using Beautiful Soup

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below is a test to get one item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output : 'Listing Number'
The code below is what I have to get all the col-xs-6 p24_propertyOverviewKey elements:
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
    p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
    for p24_propertyOverviewRow_item in p24_propertyOverview_items:
        p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
        for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
            p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
p24_propertyOverviewKey_items
The code above only outputs one item, not all of them (the bare variable on the last line is only evaluated once, after all the loops have finished, so it holds just the final row's keys).
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests
resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
texts = []
for tag in soup.select(
    # NB: this selector uses Python's implicit string concatenation
    # to split it onto several lines.
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
):
    texts.append(tag.get_text())
print(texts)
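The spaces inside the selector are CSS descendant combinators, so the four classes may be nested at any depth under one another. Equivalently, as a list comprehension:
texts = [tag.get_text() for tag in soup.select(
    ".p24_regularListing .p24_propertyOverview "
    ".p24_propertyOverviewRow .p24_propertyOverviewKey"
)]
print(texts)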

Extract date from multiple webpages with Python

I want to extract the date when a news article was published on a website. For some websites I have the exact html element where the date/time lives (div, p, time), but for some I do not:
These are links to some of the websites (German sites):
(3 Nov 2020)
http://www.linden.ch/de/aktuelles/aktuellesinformationen/?action=showinfo&info_id=1074226
(Dec. 1, 2020) http://www.reutigen.ch/de/aktuelles/aktuellesinformationen/welcome.php?action=showinfo&info_id=1066837&ls=0&sq=&kategorie_id=&date_from=&date_to=
(10/22/2020) http://buchholterberg.ch/de/Gemeinde/Information/News/Newsmeldung?filterCategory=22&newsid=905
I have tried 3 different solutions with Python libs such as requests, htmldate and date_guesser, but I always get None, or, in the case of the htmldate lib, I always get the same date (2020.1.1).
from bs4 import BeautifulSoup
import requests
from htmldate import find_date
from date_guesser import guess_date, Accuracy
# Lib find_date
url = "http://www.linden.ch/de/aktuelles/aktuellesinformationen/?action=showinfo&info_id=1074226"
response = requests.get(url)
my_date = find_date(response.content, extensive_search=True)
print(my_date, '\n')
# Lib guess_date
url = "http://www.linden.ch/de/aktuelles/aktuellesinformationen/?action=showinfo&info_id=1074226"
my_date = guess_date(url=url, html=requests.get(url).text)
print(my_date.date, '\n')
# Lib Requests # I DO NOT GET last modified TAG
my_date = requests.head('http://www.linden.ch/de/aktuelles/aktuellesinformationen/?action=showinfo&info_id=1074226')
print(my_date.headers, '\n')
Am I doing something wrong?
Can you please tell me if there is a way to extract the date of publication from websites like these (where I do not have specific div, p, or datetime elements)?
IMPORTANT!
I want to make universal date extraction, so that I can put these links in a for loop and run the same function on them.
I have never had much success with date parsing libraries, so I usually go another route. I believe that the best method to extract the date strings from the sites in your question is regular expressions.
website: linden.ch
import requests
import re as regex
from bs4 import BeautifulSoup
from datetime import datetime
url = "http://www.linden.ch/de/aktuelles/aktuellesinformationen/?action=showinfo&info_id=1074226"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
page_body = soup.find('body')
find_date = regex.search(r'(Datum der Neuigkeit)\s(\d{1,2}\W\s\w+\W\s\d{4})', str(page_body))
reformatted_timestamp = datetime.strptime(find_date.groups()[1], '%d. %b. %Y').strftime('%d-%m-%Y')
print(reformatted_timestamp)
# print output
03-11-2020
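One caveat on the strptime call: the %b directive matches abbreviated month names in the current locale, so if a target page spells its dates with German month names, something along these lines may be needed first (assuming a German locale is installed on the system):
import locale
# Assumption: the de_DE.UTF-8 locale is available on this machine
locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')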
website: buchholterberg.ch
import requests
import re as regex
from bs4 import BeautifulSoup
from datetime import datetime
url = "http://buchholterberg.ch/de/Gemeinde/Information/News/Newsmeldung?filterCategory=22&newsid=905"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
page_body = soup.find('body')
find_date = regex.search(r'(Veröffentlicht)\s\w+:\s(\d{1,2}:\d{1,2}:\d{1,2})\s(\d{1,2}.\d{1,2}.\d{4})', str(page_body))
reformatted_timestamp = datetime.strptime(find_date.groups()[2], '%d.%m.%Y').strftime('%d-%m-%Y')
print(reformatted_timestamp)
# print output
22-10-2020
Update 12-04-2020
I looked at the source code for the two Python libraries you mentioned: htmldate and date_guesser. Neither of these libraries can currently extract the date from the 3 sources listed in your question. The primary reason is the date formats and language (German) of the target sites.
I had some free time, so I put this together for you. The answer below can easily be modified to extract from any website and can be refined as needed based on the format of your target sources. It currently extracts the date from every link in urls.
all urls
import requests
import re as regex
from bs4 import BeautifulSoup
def extract_date(can_of_soup):
    page_body = can_of_soup.find('body')
    clean_body = ''.join(str(page_body).replace('\n', ''))
    if 'Datum der Neuigkeit' in clean_body or 'Veröffentlicht' in clean_body:
        date_formats = r'(Datum der Neuigkeit)\s(\d{1,2}\W\s\w+\W\s\d{4})|(Veröffentlicht am: \d{2}:\d{2}:\d{2} )(\d{1,2}.\d{1,2}.\d{4})'
        find_date = regex.search(date_formats, clean_body, regex.IGNORECASE)
        if find_date:
            clean_tuples = [i for i in list(find_date.groups()) if i]
            return ''.join(clean_tuples[1])
    else:
        tags = ['extra', 'elementStandard elementText', 'icms-block icms-information-date icms-text-gemeinde-color']
        for tag in tags:
            date_tag = page_body.find('div', {'class': f'{tag}'})
            if date_tag is not None:
                children = date_tag.findChildren()
                if children:
                    find_date = regex.search(r'(\d{1,2}.\d{1,2}.\d{4})', str(children))
                    return ''.join(find_date.groups())
                else:
                    return ''.join(date_tag.contents)

def get_soup(target_url):
    response = requests.get(target_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

urls = {'http://www.linden.ch/de/aktuelles/aktuellesinformationen/?action=showinfo&info_id=1074226',
        'http://www.reutigen.ch/de/aktuelles/aktuellesinformationen/welcome.php?action=showinfo&info_id=1066837&ls=0'
        '&sq=&kategorie_id=&date_from=&date_to=',
        'http://buchholterberg.ch/de/Gemeinde/Information/News/Newsmeldung?filterCategory=22&newsid=905',
        'https://www.steffisburg.ch/de/aktuelles/meldungen/Hochwasserschutz-und-Laengsvernetzung-Zulg.php',
        'https://www.wallisellen.ch/aktuellesinformationen/924227',
        'http://www.winkel.ch/de/aktuellesre/aktuelles/aktuellesinformationen/welcome.php?action=showinfo&info_id'
        '=1093910&ls=0&sq=&kategorie_id=&date_from=&date_to=',
        'https://www.aeschi.ch/de/aktuelles/mitteilungen/artikel/?tx_news_pi1%5Bnews%5D=87&tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5Baction%5D=detail&cHash=ab4d329e2f1529d6e3343094b416baed'}
for url in urls:
    html = get_soup(url)
    article_date = extract_date(html)
    print(article_date)

Extract text from an html file with Python

I have written some code to extract text from an html file. The code extracts the requested line from the webpage; now I want to extract the sequence data. Unfortunately I am not able to extract the text: it's showing an error.
import urllib2
from HTMLParser import HTMLParser
import nltk
from bs4 import BeautifulSoup
# Proxy information was removed
# from these two lines
proxyOpener = urllib2.build_opener(proxyHandler)
urllib2.install_opener(proxyOpener)
response = urllib2.urlopen('http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c')
################## BS Block ################################
soup = BeautifulSoup(response)
text = soup.get_text()
print text
##########################################################
html = response.readlines()
for l in html:
    if "|Rv0470c|" in l:
        print l  # code is running successfully till here
raw = nltk.clean_html(html)
print raw
How can I run this code successfully? I have already checked all the available threads and solutions, but nothing works.
I want to extract this part:
M. tuberculosis H37Rv|Rv0470c|pcaA
MSVQLTPHFGNVQAHYDLSDDFFRLFLDPTQTYSCAYFERDDMTLQEAQIAKIDLALGKLNLEPGMTLLDIGCGWGATMRRAIEKYDVNVVGLTLSENQAGHVQKMFDQMDTPRSRRVLLEGWEKFDEPVDRIVSIGAFEHFGHQRYHHFFEVTHRTLPADGKMLLHTIVRPTFKEGREKGLTLTHELVHFTKFILAEIFPGGWLPSIPTVHEYAEKVGFRVTAVQSLQLHYARTLDMWATALEANKDQAIAIQSQTVYDRYMKYLTGCAKLFRQGYTDVDQFTLEK
I am able to extract the desired text with the code below, which works without any dependencies except urllib2, and for my case it works like a charm.
import urllib2
httpProxy = {'username': '------', 'password': '-------', 'host': '------', 'port': '-----'}
# proxy URL format: http://user:pass@host:port
proxyHandler = urllib2.ProxyHandler({'http': 'http://'+httpProxy['username']+':'+httpProxy['password']+'@'+httpProxy['host']+':'+httpProxy['port']})
proxyOpener = urllib2.build_opener(proxyHandler)
urllib2.install_opener(proxyOpener)
response = urllib2.urlopen('http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c')
html = response.readlines()
f = open("/home/zebrafish/Desktop/output.txt", 'w')
for l in html:
    if "|Rv0470c|" in l:
        l = l.split("</small>")[0].split("<TR><TD><small style=font-family:courier>")[1]
        l = l.split("<br />")
        ttl = l[:1]
        seq = "".join(l[1:])
        f.write("".join(ttl))
        f.write(seq)
f.close()
I'm not quite sure what exactly you are requesting as a whole, but here's my ad hoc take on your problem, which does retrieve the part of the html you request. Maybe you can get some ideas. (Adjust for Python 2.)
import requests
from bs4 import BeautifulSoup
url = 'http://tuberculist.epfl.ch/quicksearch.php?gene+name=Rv0470c'
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html, "lxml")
for n in soup.find_all('tr'):
    if "|Rv0470c|" in n.text:
        nt = n.text
        while '\n' in nt:
            nt = nt.replace('\n', '\t')  # reassign: str.replace returns a new string
        nt = nt.split('\t')
        nt = [x for x in nt if "|Rv0470c|" in x][0].strip()
        print(nt.lstrip('>'))

Building a Python web scraper, need help getting the correct output

I was building a web-scraper using python.
The purpose of my scraper is to fetch all the links to websites from this webpage http://www.ebizmba.com/articles/torrent-websites
I want output like -
www.thepiratebay.se
www.kat.ph
I am new to python and scraping, and I was doing this just for practice. Please help me to get the right output.
My code:
import requests
from bs4 import BeautifulSoup
r = requests.get("http://www.ebizmba.com/articles/torrent-websites")
soup = BeautifulSoup(r.content, "html.parser")
data = soup.find_all("div", {"class": "main-container-2"})
for item in data:
    print(item.contents[1].find_all("a"))
My output (screenshot): http://i.stack.imgur.com/Xi37B.png
If you are web scraping for practice, have a look at regular expressions.
This would get just the headline links. The Needle string is the match pattern; the parentheses (http://.*?) form the capture group.
import urllib2
import re
myURL = "http://www.ebizmba.com/articles/torrent-websites"
req = urllib2.Request(myURL)
Needle1 = '<p><a href="(http:.*?)" rel="nofollow" target="_blank">'
for match in re.finditer(Needle1, urllib2.urlopen(req).read()):
    print(match.group(1))
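This is Python 2 code (urllib2 was split into urllib.request in Python 3). A rough Python 3 equivalent of the same idea, with the Needle pattern unchanged:
import re
import urllib.request

myURL = "http://www.ebizmba.com/articles/torrent-websites"
# read() returns bytes in Python 3, so decode before regex matching
html = urllib.request.urlopen(myURL).read().decode("utf-8", errors="replace")
Needle1 = '<p><a href="(http:.*?)" rel="nofollow" target="_blank">'
for match in re.finditer(Needle1, html):
    print(match.group(1))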
Use .get('href') like this:
import requests
from bs4 import BeautifulSoup
r = requests.get("http://www.ebizmba.com/articles/torrent-websites")
soup = BeautifulSoup(r.text, "html.parser")
data = soup.find_all("div", {"class": "main-container-2"})
for i in data:
    for j in i.contents[1].find_all("a"):
        print(j.get('href'))
Full output:
http://www.thepiratebay.se
http://siteanalytics.compete.com/thepiratebay.se
http://quantcast.com/thepiratebay.se
http://www.alexa.com/siteinfo/thepiratebay.se/
http://www.kickass.to
http://siteanalytics.compete.com/kickass.to
http://quantcast.com/kickass.to
http://www.alexa.com/siteinfo/kickass.to/
http://www.torrentz.eu
http://siteanalytics.compete.com/torrentz.eu
http://quantcast.com/torrentz.eu
http://www.alexa.com/siteinfo/torrentz.eu/
http://www.extratorrent.cc
http://siteanalytics.compete.com/extratorrent.cc
http://quantcast.com/extratorrent.cc
http://www.alexa.com/siteinfo/extratorrent.cc/
http://www.yify-torrents.com
http://siteanalytics.compete.com/yify-torrents.com
http://quantcast.com/yify-torrents.com
http://www.alexa.com/siteinfo/yify-torrents.com
http://www.bitsnoop.com
http://siteanalytics.compete.com/bitsnoop.com
http://quantcast.com/bitsnoop.com
http://www.alexa.com/siteinfo/bitsnoop.com/
http://www.isohunt.to
http://siteanalytics.compete.com/isohunt.to
http://quantcast.com/isohunt.to
http://www.alexa.com/siteinfo/isohunt.to/
http://www.sumotorrent.sx
http://siteanalytics.compete.com/sumotorrent.sx
http://quantcast.com/sumotorrent.sx
http://www.alexa.com/siteinfo/sumotorrent.sx/
http://www.torrentdownloads.me
http://siteanalytics.compete.com/torrentdownloads.me
http://quantcast.com/torrentdownloads.me
http://www.alexa.com/siteinfo/torrentdownloads.me/
http://www.eztv.it
http://siteanalytics.compete.com/eztv.it
http://quantcast.com/eztv.it
http://www.alexa.com/siteinfo/eztv.it/
http://www.rarbg.com
http://siteanalytics.compete.com/rarbg.com
http://quantcast.com/rarbg.com
http://www.alexa.com/siteinfo/rarbg.com/
http://www.1337x.org
http://siteanalytics.compete.com/1337x.org
http://quantcast.com/1337x.org
http://www.alexa.com/siteinfo/1337x.org/
http://www.torrenthound.com
http://siteanalytics.compete.com/torrenthound.com
http://quantcast.com/torrenthound.com
http://www.alexa.com/siteinfo/torrenthound.com/
https://demonoid.org/
http://siteanalytics.compete.com/demonoid.pw
http://quantcast.com/demonoid.pw
http://www.alexa.com/siteinfo/demonoid.pw/
http://www.fenopy.se
http://siteanalytics.compete.com/fenopy.se
http://quantcast.com/fenopy.se
http://www.alexa.com/siteinfo/fenopy.se/
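The question only asks for the torrent-site links, while the output above also includes the compete, quantcast and alexa analytics links for each entry. A sketch of one way to filter them out, keying off the three analytics hosts seen in the output:
import requests
from bs4 import BeautifulSoup

# Hosts of the per-site analytics links visible in the output above
EXCLUDE = ("siteanalytics.compete.com", "quantcast.com", "www.alexa.com")

r = requests.get("http://www.ebizmba.com/articles/torrent-websites")
soup = BeautifulSoup(r.text, "html.parser")
data = soup.find_all("div", {"class": "main-container-2"})
for i in data:
    for j in i.contents[1].find_all("a"):
        href = j.get('href')
        if href and not any(host in href for host in EXCLUDE):
            print(href)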
