Parsing HTML with Python requests

I'm not a coder, but I need to implement a simple HTML parser.
After a bit of research I was able to put together the following example:
from lxml import html
import requests

page = requests.get('https://URL.COM')
tree = html.fromstring(page.content)

# This will create a list of buyers:
buyers = tree.xpath('//div[@title="buyer-name"]/text()')
# This will create a list of prices:
prices = tree.xpath('//span[@class="item-price"]/text()')

print('Buyers: ', buyers)
print('Prices: ', prices)
How can I use tree.xpath to extract all the strings that start with "://" and end with ".com.br"?

As @nosklo pointed out here, you are looking for href attributes and their associated links. A parse tree is organized by the HTML elements themselves, and you find text by searching those elements specifically. For URLs, that looks like the following (using the lxml library in Python 3.6):
from lxml import etree
from io import StringIO
import requests

# Set an explicit HTML parser
parser = etree.HTMLParser()

page = requests.get('https://URL.COM')

# Decode the page content from bytes to string
html = page.content.decode("utf-8")

# Create your etree with a StringIO object, which functions similarly
# to a file handle
tree = etree.parse(StringIO(html), parser=parser)

# Call this function and pass in your tree
def get_links(tree):
    # This will get the anchor tags <a href...>
    refs = tree.xpath("//a")
    # Get the url from each ref
    links = [link.get('href', '') for link in refs]
    # Return only the links that end with .com.br
    return [l for l in links if l.endswith('.com.br')]

# Example call
links = get_links(tree)
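If you also want to enforce the "://" part of the question, one option is to inspect each link's parsed scheme and host. A minimal sketch, assuming the same tree object as above; the helper name get_com_br_links is just for illustration:

from urllib.parse import urlparse

def get_com_br_links(tree):
    # Collect every href attribute from the anchor tags
    links = [a.get('href', '') for a in tree.xpath('//a')]
    # Keep only absolute links (those containing "://") whose host ends with .com.br
    return [l for l in links
            if '://' in l and urlparse(l).netloc.endswith('.com.br')]

links = get_com_br_links(tree)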

Related

Python / BeautifulSoup - Scraping XML data from Clinicaltrials.gov API - parse data within XML parent/child tags

I'm new to working with XML and BeautifulSoup and I am trying to get a dataset of clinical trials using Clinicaltrials.gov's new API that converts a list of trials into an XML dataset. I tried using find_all() like I typically do with HTML, but I'm not having the same luck. I've tried a few other approaches, like converting to a string and splitting (very messy) but I don't want to clutter my code with failed attempts.
Bottom line: I want to extract all NCTIds (I know I can just convert the whole thing into a string and use regex, but I want to learn how to actually parse XML correctly) and official titles for each clinical trial listed in the XML file. Any help is appreciated!
import requests
from bs4 import BeautifulSoup
from lxml import etree
import lxml.html
url = 'https://clinicaltrials.gov/api/query/full_studies?expr=diabetes+telehealth+peer+support&+AREA%5BStartDate%5D+EXPAND%5BTerm%5D+RANGE%5B01%2F01%2F2020%2C+09%2F01%2F2020%5D&min_rnk=1&max_rnk=10&fmt=xml'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')
m1_nctid = soup.find_all('Field Name="NCTId"') #This comes back with 0 results
m1_officialtitle = soup.find_all('Field Name="OfficialTitle"') #This comes back with 0 results
You can filter on attributes like the following:
m1_nctid = soup.find_all("field", {"name": "NCTId"})
m1_officialtitle = soup.find_all("field", {"name": "OfficialTitle"})
and then iterate over each result to get its text, for example:
official_titles = [result.text for result in m1_officialtitle]
For more info, you can check the BeautifulSoup documentation.
You can search for the field tag in lowercase and pass name as an attribute in attrs. This works with just BeautifulSoup; there's no need to use etree:
import requests
from bs4 import BeautifulSoup
url = "https://clinicaltrials.gov/api/query/full_studies?expr=diabetes+telehealth+peer+support&+AREA%5BStartDate%5D+EXPAND%5BTerm%5D+RANGE%5B01%2F01%2F2020%2C+09%2F01%2F2020%5D&min_rnk=1&max_rnk=10&fmt=xml"
response = requests.get(url)
soup = BeautifulSoup(response.content, "lxml")
m1_nctid = soup.find_all("field", attrs={"name": "NCTId"})
m1_officialtitle = soup.find_all("field", attrs={"name": "OfficialTitle"})
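To turn those tag lists into plain values, you can read the .text of each result. A minimal sketch, assuming the m1_nctid and m1_officialtitle lists from the snippet above:

# Extract the text content from each matched <field> tag
nct_ids = [tag.text for tag in m1_nctid]
official_titles = [tag.text for tag in m1_officialtitle]

# Pair them up, one (NCTId, OfficialTitle) tuple per study
studies = list(zip(nct_ids, official_titles))
print(studies[:3])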

HTML scraping a website with duplicated div class names

I'm currently working on HTML scraping of Baka-Updates.
However, the div class names are duplicated.
Since my goal is CSV or JSON output, I would like to use the information in [sCat] as the column names and store the corresponding values from [sContent].
Is there a way to scrape this kind of website?
Thanks,
Sample
https://www.mangaupdates.com/series.html?id=75363
from lxml import html
import requests
page = requests.get('http://www.mangaupdates.com/series.html?id=153558?')
tree = html.fromstring(page.content)
#Get the name of the columns.... I hope
sCat = tree.xpath('//div[@class="sCat"]/text()')
#Get the actual data
sContent = tree.xpath('//div[@class="sContent"]/text()')
print('sCat: ', sCat)
print('sContent: ', sContent)
I tried, but I couldn't find anything that works.
@Jasper Nichol M Fabella
I tried editing your code and got the following output. Maybe it will help.
from lxml import html
import requests

page = requests.get('http://www.mangaupdates.com/series.html?id=153558?')
tree = html.fromstring(page.content)
# print(page.content)

# Get the name of the columns.... I hope
sCat = tree.xpath('//div[@class="sCat"]')
# Get the actual data
sContent = tree.xpath('//div[@class="sContent"]')

print('sCat: ', len(sCat))
print('sContent: ', len(sContent))

json_dict = {}
for i in range(0, len(sCat)):
    # print(''.join(sCat[i].itertext()))
    sCat_text = ''.join(sCat[i].itertext())
    sContent_text = ''.join(sContent[i].itertext())
    json_dict[sCat_text] = sContent_text

print(json_dict)
Running it prints the sCat/sContent counts followed by the assembled dictionary.
Hope it helps.
You can use XPath expressions to target exactly what you want to scrape.
Here is an example with the requests and lxml libraries:
from lxml import html
import requests
r = requests.get('https://www.mangaupdates.com/series.html?id=75363')
tree = html.fromstring(r.content)
sCat = [i.text_content().strip() for i in tree.xpath('//div[@class="sCat"]')]
sContent = [i.text_content().strip() for i in tree.xpath('//div[@class="sContent"]')]
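From there, pairing the two lists gives you the column-name to value mapping the question asks for. A minimal sketch, assuming the sCat and sContent lists from the snippet above; the output filename is just an example:

import csv

# Pair each category label with its content
rows = dict(zip(sCat, sContent))

# Write a single-row CSV whose header is the sCat labels
with open('series.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(rows.keys()))
    writer.writeheader()
    writer.writerow(rows)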
What are you using to scrape?
If you are using BeautifulSoup, you can search for all matching content on the page with the find_all method and a class identifier, then iterate through the results. You can use the special class_ keyword argument.
Something like:
import bs4

# page.content here stands in for whatever HTML you have fetched
soup = bs4.BeautifulSoup(page.content, 'html.parser')
divs = soup.find_all('div', class_='sCat')
# do the rest of your logic work here
Edit: I was typing on my mobile with a cached page before you made the edits, so I didn't see the changes. I see you are using the raw lxml library to parse. Yes, that's faster, but I'm not too familiar with it, as I've only used raw lxml for one project. I think you can chain two search methods to distill something equivalent.
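For completeness, here is a hedged sketch of roughly equivalent pairing with BeautifulSoup, assuming the same series page; it mirrors the lxml answer above rather than claiming the site's exact markup:

import requests
from bs4 import BeautifulSoup

page = requests.get('https://www.mangaupdates.com/series.html?id=75363')
soup = BeautifulSoup(page.content, 'html.parser')

# Pair each sCat label with the sContent value that follows it
cats = [d.get_text(strip=True) for d in soup.find_all('div', class_='sCat')]
contents = [d.get_text(strip=True) for d in soup.find_all('div', class_='sContent')]
record = dict(zip(cats, contents))
print(record)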

Beautiful Soup - how to extract a string from an object

I am learning Beautiful Soup. I have succeeded in tracking down the HTML lines that I need.
My next step is to extract an Id value from those lines.
The code to find the lines looks like this:
object = soup_station.find('img',{'src': re.compile("^Controls")})
If I now print object I will get this, for example:
<img src="Controls/RiverLevels/ChartImage.jpg?Id=471&ChartType=Histogram" id="StationDetails_Chart1_chartImage" alt="Current river level" />
The part I want to extract in the line above is the "471" after Id=.
I tried using re.search on object but it seems that object is not text.
Any help would be much appreciated!
You can adapt the following:
s = '<img src="Controls/RiverLevels/ChartImage.jpg?Id=471&ChartType=Histogram" id="StationDetails_Chart1_chartImage" alt="Current river level" />'
from bs4 import BeautifulSoup
import re
from urlparse import urlsplit, parse_qs
soup = BeautifulSoup(s)
# find the node with a src starting with Controls
node = soup.find('img',{'src': re.compile("^Controls")})
# Break up the url in the src attribute
url_split = urlsplit(node['src'])
# Parse out the query parameter from the url
qs = parse_qs(url_split.query)
# Display the value for `Id`
print qs['Id'][0]
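If you are on Python 3, urlparse moved to urllib.parse; a minimal sketch of the same idea, using the same example tag:

from bs4 import BeautifulSoup
from urllib.parse import urlsplit, parse_qs

s = '<img src="Controls/RiverLevels/ChartImage.jpg?Id=471&ChartType=Histogram" id="StationDetails_Chart1_chartImage" alt="Current river level" />'
soup = BeautifulSoup(s, 'html.parser')
node = soup.find('img')

# Parse the query string out of the src attribute and read Id
qs = parse_qs(urlsplit(node['src']).query)
print(qs['Id'][0])  # -> '471'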
You want to make sure that you are performing the regex search on the tag's src attribute string rather than on the tag object itself. You can give this a try:
import re
ele = soup_station.find('img')
src = ele['src']
match = re.search(r'\?Id=(\d+)', src)
ele_id = match.group(1)

Write a Python script that goes through the links on a page recursively

I'm doing a project for my school in which I would like to compare scam mails. I found this website: http://www.419scam.org/emails/
Now what I would like to do is save every scam in a separate document so I can analyse them later.
Here is my code so far:
import BeautifulSoup, urllib2
address='http://www.419scam.org/emails/'
html = urllib2.urlopen(address).read()
f = open('test.txt', 'wb')
f.write(html)
f.close()
This saves the whole HTML file in text format. Now I would like to strip the file and save the content of the HTML links to the scams:
01
02
03
etc.
If I get that, I would still need to go a step further and open and save another href. Any idea how I can do it all in one Python script?
Thank you!
You picked the right tool in BeautifulSoup. Technically you could do it all in one script, but you might want to segment it, because it looks like you'll be dealing with tens of thousands of e-mails, all of which are separate requests, and that will take a while.
The BeautifulSoup documentation is going to help you a lot, but here's a little code snippet to get you started. This gets all of the <a> tags that link to index pages for the e-mails, extracts their href values, and prepends the base of the URL so they can be accessed directly.
from bs4 import BeautifulSoup
import re
import urllib2

soup = BeautifulSoup(urllib2.urlopen("http://www.419scam.org/emails/"))
tags = soup.find_all(href=re.compile(r"20......../index\.htm"))

links = []
for t in tags:
    links.append("http://www.419scam.org/emails/" + t['href'])
re is Python's regular expressions module. In the find_all call, I told BeautifulSoup to find all the tags in the soup whose href attribute matches that regular expression. I chose this regular expression to get only the e-mail index pages rather than all of the href links on the page; I noticed that the index page links all followed that pattern in their URLs.
Having all the proper 'a' tags, I then looped through them, extracting the string from the href attribute with t['href'] and prepending the rest of the URL to the front of the string, to get full URLs.
Reading through that documentation, you should get an idea of how to expand these techniques to grab the individual e-mails.
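To take the next step, you could visit each index page and collect the links to the individual e-mails in the same way. A hedged sketch under the same urllib2/BeautifulSoup setup; the exact href pattern of the individual e-mail pages is an assumption, so adjust the regular expression to match what you see on the site:

email_links = []
for index_url in links:
    index_soup = BeautifulSoup(urllib2.urlopen(index_url))
    # Assumed pattern: individual e-mails are .htm pages linked from each index
    for a in index_soup.find_all(href=re.compile(r"\.htm$")):
        email_links.append(index_url.rsplit('/', 1)[0] + '/' + a['href'])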
You might also find value in requests and lxml.html. requests is another way to make HTTP requests, and lxml is an alternative for parsing XML and HTML content.
There are many ways to search the html document but you might want to start with cssselect.
import requests
from lxml.html import fromstring
url = 'http://www.419scam.org/emails/'
doc = fromstring(requests.get(url).content)
atags = doc.cssselect('a')
# using .get('href', '') syntax because not all a tags will have an href
hrefs = (a.attrib.get('href', '') for a in atags)
Or as suggested in the comments using .iterlinks(). Note that you will still need to filter if you only want 'a' tags. Either way the .make_links_absolute() call is probably going to be helpful. It is your homework though, so play around with it.
doc.make_links_absolute(base_url=url)
hrefs = (l[2] for l in doc.iterlinks() if l[0].tag == 'a')
Next up for you... how to loop through and open all of the individual spam links.
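Here is a hedged sketch of that last step with requests (Python 3), assuming the absolute hrefs produced above and filtering to the e-mail pages by their .htm suffix; the output directory name is just an example:

import os
import requests

out_dir = 'scam_emails'
os.makedirs(out_dir, exist_ok=True)

for href in hrefs:
    if not href.endswith('.htm'):
        continue
    # Build a filename from the tail of the URL and save the raw HTML
    name = '-'.join(href.rstrip('/').split('/')[-2:])
    with open(os.path.join(out_dir, name), 'wb') as f:
        f.write(requests.get(href).content)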
To get all the links on the page, you could use BeautifulSoup. Take a look at this related page; it can help, and it actually shows how to do exactly what you need.
To save all pages, you could do the same as what you do in your current code, but within a loop that would iterate over all links you'll have extracted and stored, say, in a list.
Here's a solution using lxml + XPath and urllib2:
#!/usr/bin/env python2 -u
# -*- coding: utf8 -*-

import cookielib, urllib2
from lxml import etree

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
page = opener.open("http://www.419scam.org/emails/")
reddit = etree.HTML(page.read())

# XPath expression: get all links under body/p[2] containing *.htm
for node in reddit.xpath('/html/body/p[2]/a[contains(@href, ".htm")]'):
    for i in node.items():
        url = 'http://www.419scam.org/emails/' + i[1]
        page = opener.open(url)
        lst = url.split('/')
        try:
            if lst[6]:  # else it's a "month" link
                filename = '/tmp/' + url.split('/')[4] + '-' + url.split('/')[5]
                f = open(filename, 'w')
                f.write(page.read())
                f.close()
        except IndexError:
            pass

# vim:ts=4:sw=4
You could use the built-in HTMLParser and specify the type of tag you are searching for.
from HTMLParser import HTMLParser
import urllib2

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    print attr[1]

address = 'http://www.419scam.org/emails/'
html = urllib2.urlopen(address).read()

f = open('test.txt', 'wb')
f.write(html)
f.close()

parser = MyHTMLParser()
parser.feed(html)
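If you would rather collect the links than print them, a small variation is to store them on the parser instance. A minimal sketch under the same Python 2 setup; the LinkCollector name is just for illustration:

class LinkCollector(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

collector = LinkCollector()
collector.feed(html)
print collector.links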

Script to parse weather

I'm trying to learn Python. My only experience is AppleScript, and it's not so easy to learn... so far, anyway.
I'm trying to parse an XML weather feed, and so far I have the data I need, but I can't figure out how to get it into a list to process it further. Can anyone help?
from BeautifulSoup import BeautifulSoup
import xml.etree.cElementTree as ET
from xml.etree.cElementTree import parse
import urllib2

url = "http://www.weatheroffice.gc.ca/rss/city/ab-52_e.xml"
response = urllib2.urlopen(url)

local_file = open("\Temp\weather.xml", "w")
local_file.write(response.read())
local_file.close()

invalid_tags = ['b', 'br']

tree = parse("\Temp\weather.xml")
stuff = tree.findall("channel/item/description")
item = stuff[1]

parsewx = BeautifulSoup(stuff[1].text)
for tag in invalid_tags:
    for match in parsewx.findAll(tag):
        match.replaceWithChildren()
print parsewx
Since XML is structured data, BeautifulSoup returns a tree of Tags.
The documentation has extensive information on how to search and navigate in that tree.
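For example, to get each entry's title and description into lists you can process further, you can navigate the ElementTree result directly. A minimal sketch, assuming the tree object from the question and the standard RSS channel/item layout of that feed:

# Each RSS <item> carries a <title> (e.g. "Current Conditions: ...") and a <description>
titles = [item.findtext("title") for item in tree.findall("channel/item")]
descriptions = [item.findtext("description") for item in tree.findall("channel/item")]

# Pair them up so each forecast entry can be processed on its own
for title, description in zip(titles, descriptions):
    print title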
