(Python) Trying to isolate some data from a website

The script downloads images from wallbase.cc's random and toplist pages. It looks for a seven-digit string that identifies each image, then inserts that id into a URL and downloads the file. The only problem I seem to have is isolating the seven-digit string.
What I want to be able to do is search for <div id="thumbxxxxxxx"> and then assign xxxxxxx to a variable.
Here's what I have so far.
import urllib
import os
import sys
import re
#Written in Python 2.7 with LightTable

def get_id():
    import urllib.request
    req = urllib.request.Request('http://wallbase.cc/'+initial_prompt)
    response = urllib.request.urlopen(req)
    the_page = response.read()
    for "data-id="" in the_page

def toplist():
    #We need to define how to find the images to download
    #The idea is to go to http://wallbase.cc/x and to take all of strings containing <a href="http://wallbase.cc/wallpaper/xxxxxxx" </a>
    #And to request the image file from that URL.
    #Then the file will be put in a user defined directory
    image_id = raw_input("Enter the seven digit identifier for the image to be downloaded to "+ directory+ "...\n>>> ")
    f = open(directory+image_id+ '.jpg','wb')
    f.write(urllib.urlopen('http://wallpapers.wallbase.cc/rozne/wallpaper-'+image_id+'.jpg').read())
    f.close()

directory = raw_input("Enter the directory in which the images will be downloaded.\n>>> ")
initial_prompt = input("What do you want to download from?\n\t1: Toplist\n\t2: Random\n>>> ")

if initial_prompt == 1:
    urlid = 'toplist'
    toplist()
elif initial_prompt == 2:
    urlid = 'random'
    random()
Any/all help is very much appreciated :)

You probably want to use a web-scraping library like BeautifulSoup; see e.g. this SO question on web scraping in Python.
import re
import urllib2
from BeautifulSoup import BeautifulSoup

# download and parse HTML
url = 'http://wallbase.cc/toplist'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)

# find the links we want
links = soup('a', href=re.compile(r'^http://wallbase\.cc/wallpaper/\d+$'))
for l in links:
    href = l.get('href')
    print href                   # u'http://wallbase.cc/wallpaper/1750539'
    print href.split('/')[-1]    # u'1750539'
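
To actually download each image, the id can be fed back into the URL pattern the question uses (the wallpapers.wallbase.cc/rozne/... scheme is taken from the question and assumed to still hold). A minimal sketch continuing from the loop above:
import os
import urllib

directory = '/tmp/wallpapers/'  # assumed target directory; substitute the user-supplied one
for l in links:
    image_id = l.get('href').split('/')[-1]
    image_url = 'http://wallpapers.wallbase.cc/rozne/wallpaper-' + image_id + '.jpg'
    # save as <id>.jpg in the chosen directory
    urllib.urlretrieve(image_url, os.path.join(directory, image_id + '.jpg'))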

If you only want to use the standard library, you could use regular expressions.
pattern = re.compile(r'<div id="thumb(.{7})"')
...
for data_id in re.findall(pattern, the_page):
    pass  # do something with data_id
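
For completeness, here is how that pattern might be applied to the page the question fetches. This is a sketch in Python 2, using urllib2 rather than the Python 3 urllib.request call in the question; the example id comes from the answer above:
import re
import urllib2

the_page = urllib2.urlopen('http://wallbase.cc/toplist').read()
pattern = re.compile(r'<div id="thumb(.{7})"')
image_ids = re.findall(pattern, the_page)  # e.g. ['1750539', ...]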

Related

Unique phrase in the source code of an HTML page in Python3

I'm trying to figure out how to get Python 3 to display a certain phrase from an HTML document. For example, I'll be using the search engine https://duckduckgo.com.
I'd like the code to search for var error=document.getElementById and display what is inside the parentheses; in this case it would be "error_homepage". Any help would be appreciated.
import urllib.request
u = input ('Please enter URL: ')
x = urllib.request.urlopen(u)
print(x.read())
You can simply read the website of interest, as you suggested, using urllib.request, and use regular expressions to search the retrieved HTML/JS/... code:
import re
import urllib.request
# the URL that data is read from
url = "http://..."
# the regex pattern for extracting element IDs
pattern = r"var error = document.getElementById\(['\"](?P<element_id>[a-zA-Z0-9_-]+)['\"]\);"
# fetch HTML code
with urllib.request.urlopen(url) as f:
html = f.read().decode("utf8")
# extract element IDs
for m in re.findall(pattern, html):
print(m)
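
As a quick sanity check of the pattern (the sample line below is an assumed example of the kind of JavaScript the question describes, not a verified excerpt from duckduckgo.com):
sample = 'var error = document.getElementById("error_homepage");'
print(re.findall(pattern, sample))  # ['error_homepage']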

Searching through HTML pages for certain text?

I wanted to play around with python to learn it, so I'm taking on a little project, but a part of it requires me to search for a name on this list:
https://bughunter.withgoogle.com/characterlist/1
(the number 1 at the end of the URL is to be incremented by one each time to search for the name)
So I will be scraping the HTML. I'm new to Python and would appreciate it if someone could give me an example of how to make this work.
import json
import requests
from bs4 import BeautifulSoup

URL = 'https://bughunter.withgoogle.com'

def get_page_html(page_num):
    r = requests.get('{}/characterlist/{}'.format(URL, page_num))
    r.raise_for_status()
    return r.text

def get_page_profiles(page_html):
    page_profiles = {}
    soup = BeautifulSoup(page_html, 'html.parser')
    for table_cell in soup.find_all('td'):
        profile_name = table_cell.find_next('h2').text
        profile_url = table_cell.find_next('a')['href']
        page_profiles[profile_name] = '{}{}'.format(URL, profile_url)
    return page_profiles

if __name__ == '__main__':
    all_profiles = {}
    for page_number in range(1, 81):
        current_page_html = get_page_html(page_number)
        current_page_profiles = get_page_profiles(current_page_html)
        all_profiles.update(current_page_profiles)
    with open('google_hall_of_fame_profiles.json', 'w') as f:
        json.dump(all_profiles, f, indent=2)
Your question wasn't clear about how you wanted the data structured after scraping, so I just saved the profiles in a dict (with the key/value pair as {profile_name: profile_url}) and then dumped the results to a JSON file.
Let me know if anything is unclear!
Try this. You will need to install bs4 first (Python 3). It will get all of the names of the people on the page:
from bs4 import BeautifulSoup as soup
import urllib.request

text = urllib.request.urlopen('https://bughunter.withgoogle.com/characterlist/1').read().decode('utf-8')
text = soup(text, 'html.parser')
print(text.findAll(class_='item-list')[0].get_text())
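
Since the question mentions incrementing the trailing number each time, the same call can go in a loop. The upper bound of 80 pages is borrowed from the first answer and is an assumption:
from bs4 import BeautifulSoup as soup
import urllib.request

for page_num in range(1, 81):  # page count assumed from the first answer
    url = 'https://bughunter.withgoogle.com/characterlist/{}'.format(page_num)
    html = urllib.request.urlopen(url).read().decode('utf-8')
    page = soup(html, 'html.parser')
    print(page.findAll(class_='item-list')[0].get_text())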

Python, Limiting search at a specific hyperlink on webpage

I am trying to find a way to download .pdf files through hyperlinks on a webpage.
Learning from How can i grab pdf links from website with Python script, the approach is:
import lxml.html, urllib2, urlparse

base_url = 'http://www.renderx.com/demos/examples.html'
res = urllib2.urlopen(base_url)
tree = lxml.html.fromstring(res.read())
ns = {'re': 'http://exslt.org/regular-expressions'}
for node in tree.xpath('//a[re:test(@href, "\.pdf$", "i")]', namespaces=ns):
    print urlparse.urljoin(base_url, node.attrib['href'])
The question is, how can I only find the .pdf under a specific hyperlink, instead of listing all the .pdf(s) on the webpage?
One way is to limit the print to when it contains certain words, like:
if 'CA-Personal.pdf' in node:
But what if the .pdf file name changes? Or what if I just want to limit the search to the part of the page under the "Applications" hyperlink? Thanks.
Well, it's not the best way, but there's no harm in doing it like this:
from bs4 import BeautifulSoup
import urllib2

domain = 'http://www.renderx.com'
url = 'http://www.renderx.com/demos/examples.html'

page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())

app = soup.find_all('a', text="Applications")
for aa in app:
    print domain + aa['href']
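
If the goal is instead to grab only the .pdf links that sit under the "Applications" section, one option is to start from that anchor and walk forward through the document. This is only a sketch: it assumes the PDF links actually follow the "Applications" anchor in document order on that page.
import re
import urllib2
import urlparse
from bs4 import BeautifulSoup

url = 'http://www.renderx.com/demos/examples.html'
soup = BeautifulSoup(urllib2.urlopen(url).read())

# locate the "Applications" anchor, then collect .pdf links that come after it
anchor = soup.find('a', text='Applications')
if anchor is not None:
    for link in anchor.find_all_next('a', href=re.compile(r'\.pdf$', re.I)):
        print urlparse.urljoin(url, link['href'])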

Downloading files from multiple websites.

This is my first Python project so it is very basic and rudimentary.
I often have to clean off viruses for friends and the free programs that I use are updated often. Instead of manually downloading each program, I was trying to create a simple way to automate the process. Since I am also trying to learn python I thought it would be a good opportunity to practice.
Questions:
With some of the links I have to find the .exe file on the page first. I can find the correct URL, but I get an error when it tries to download.
Is there a way to put all of the links into a list and then create a function that goes through the list and runs on each URL? I've Googled quite a bit and I just cannot seem to make it work. Maybe I am not thinking in the right direction?
import urllib, urllib2, re, os
from BeautifulSoup import BeautifulSoup

# Website List
sas = 'http://cdn.superantispyware.com/SUPERAntiSpyware.exe'
tds = 'http://support.kaspersky.com/downloads/utils/tdsskiller.exe'
mbam = 'http://www.bleepingcomputer.com/download/malwarebytes-anti-malware/dl/7/?1'
tr = 'http://www.simplysup.com/tremover/download.html'
urllist = [sas, tds, mbam, tr]
urllist2 = []

# Find exe files to download
match = re.compile('\.exe')
data = urllib2.urlopen(urllist)
page = BeautifulSoup(data)

# Check links
#def findexe():
for link in page.findAll('a'):
    try:
        href = link['href']
        if re.search(match, href):
            urllist2.append(href)
    except KeyError:
        pass

os.chdir(r"C:\_VirusFixes")
urllib.urlretrieve(urllist2, os.path.basename(urllist2))
As you can see, I have left the function commented out as I cannot get it to work correctly.
Should I abandon the list and just download them individually? I was trying to be efficient.
Any suggestions or if you could point me in the right direction, it would be most appreciated.
In addition to mikez302's answer, here's a slightly more readable way to write your code:
import os
import re
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup

websites = [
    'http://cdn.superantispyware.com/SUPERAntiSpyware.exe',
    'http://support.kaspersky.com/downloads/utils/tdsskiller.exe',
    'http://www.bleepingcomputer.com/download/malwarebytes-anti-malware/dl/7/?1',
    'http://www.simplysup.com/tremover/download.html',
]

download_links = []

for url in websites:
    connection = urllib2.urlopen(url)
    soup = BeautifulSoup(connection)
    connection.close()
    for link in soup.findAll('a', {'href': re.compile(r'\.exe$')}):
        download_links.append(link['href'])

for url in download_links:
    urllib.urlretrieve(url, os.path.join(r'C:\_VirusFixes', os.path.basename(url)))
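
Note that two of the URLs in that list point straight at .exe files rather than at download pages, so scraping them for links will not find anything. One way to handle both cases, reusing the imports and the websites/download_links lists from the snippet above, could be (just an illustrative sketch, not tested against those sites):
for url in websites:
    if url.lower().endswith('.exe'):
        # already a direct download link, no scraping needed
        download_links.append(url)
        continue
    connection = urllib2.urlopen(url)
    soup = BeautifulSoup(connection)
    connection.close()
    for link in soup.findAll('a', {'href': re.compile(r'\.exe$')}):
        download_links.append(link['href'])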
urllib2.urlopen is a function for accessing a single URL. If you want to access multiple ones, you should loop over the list. You should do something like this:
for url in urllist:
    data = urllib2.urlopen(url)
    page = BeautifulSoup(data)

    # Check links
    for link in page.findAll('a'):
        try:
            href = link['href']
            if re.search(match, href):
                urllist2.append(href)
        except KeyError:
            pass

os.chdir(r"C:\_VirusFixes")
for href in urllist2:
    urllib.urlretrieve(href, os.path.basename(href))
The code above didn't work for me; in my case it was because the pages assemble their links through a script instead of including them in the HTML. When I ran into that problem I used the following code, which is just a scraper:
import os
import re
import urllib
import urllib2
from bs4 import BeautifulSoup

url = ''
connection = urllib2.urlopen(url)
soup = BeautifulSoup(connection)  # Everything the same up to here

regex = '(.+?).zip'  # Here we insert the pattern we are looking for
pattern = re.compile(regex)
link = re.findall(pattern, str(soup))  # This finds all the .zip (.exe) in the text

x = 0
for i in link:
    # When it finds all the .zip, it usually comes back with a lot of undesirable
    # text; luckily the file name is almost always separated by a space from the
    # rest of the text, which is why we do the split
    link[x] = i.split(' ')[len(i.split(' ')) - 1]
    x += 1

os.chdir("F:\Documents")
# This is the filepath where I want to save everything I download

for i in link:
    # Remember that the text we found doesn't include the .zip (or .exe in your
    # case), so we want to re-establish that.
    urllib.urlretrieve(url, filename=i + ".zip")
This is not as efficient as the code in the previous answers, but it will work for almost any site.

Write a python script that goes through the links on a page recursively

I'm doing a project for my school in which I would like to compare scam mails. I found this website: http://www.419scam.org/emails/
Now what I would like to do is save every scam in a separate document so that later on I can analyse them.
Here is my code so far:
import BeautifulSoup, urllib2
address='http://www.419scam.org/emails/'
html = urllib2.urlopen(address).read()
f = open('test.txt', 'wb')
f.write(html)
f.close()
This saves the whole HTML file in text format. Now I would like to strip the file and save the content of the HTML links to the scams:
01
02
03
etc.
If I get that, I would still need to go a step further and open and save another href. Any idea how I can do it in one Python script?
Thank you!
You picked the right tool in BeautifulSoup. Technically you could do it all in one script, but you might want to segment it, because it looks like you'll be dealing with tens of thousands of e-mails, all of which are separate requests, and that will take a while.
This page is going to help you a lot, but here's just a little code snippet to get you started. It gets all of the HTML tags that link to index pages for the e-mails, extracts their href links, and prepends the base URL so they can be accessed directly.
from bs4 import BeautifulSoup
import re
import urllib2
soup = BeautifulSoup(urllib2.urlopen("http://www.419scam.org/emails/"))
tags = soup.find_all(href=re.compile("20......../index\.htm"))
links = []
for t in tags:
    links.append("http://www.419scam.org/emails/" + t['href'])
're' is Python's regular expressions module. In the fifth line, I told BeautifulSoup to find all the tags in the soup whose href attribute matches that regular expression. I chose this regular expression to get only the e-mail index pages rather than all of the href links on that page. I noticed that the index page links had that pattern for all of their URLs.
Having all the proper 'a' tags, I then looped through them, extracting the string from the href attribute by doing t['href'] and appending the rest of the URL to the front of the string, to get raw string URLs.
Reading through that documentation, you should get an idea of how to expand these techniques to grab the individual e-mails.
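
As a rough next step, you could open one of those index pages and collect the individual e-mail links the same way. The sketch below continues from the snippet above and assumes the e-mail links on each index page are relative hrefs ending in .htm, which you should verify against the actual pages:
index_url = links[0]  # one of the monthly index pages collected above
index_soup = BeautifulSoup(urllib2.urlopen(index_url))
email_links = []
for t in index_soup.find_all(href=re.compile("\.htm$")):
    # assumption: e-mail links are relative to the index page's directory
    email_links.append(index_url.rsplit('/', 1)[0] + '/' + t['href'])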
You might also find value in requests and lxml.html. Requests is another way to make http requests and lxml is an alternative for parsing xml and html content.
There are many ways to search the html document but you might want to start with cssselect.
import requests
from lxml.html import fromstring
url = 'http://www.419scam.org/emails/'
doc = fromstring(requests.get(url).content)
atags = doc.cssselect('a')
# using .get('href', '') syntax because not all a tags will have an href
hrefs = (a.attrib.get('href', '') for a in atags)
Or as suggested in the comments using .iterlinks(). Note that you will still need to filter if you only want 'a' tags. Either way the .make_links_absolute() call is probably going to be helpful. It is your homework though, so play around with it.
doc.make_links_absolute(base_url=url)
hrefs = (l[2] for l in doc.iterlinks() if l[0].tag == 'a')
Next up for you... how to loop through and open all of the individual spam links.
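A minimal sketch of that loop, reusing the requests import and the hrefs generator from the snippet above; it assumes the hrefs have already been made absolute with make_links_absolute(), and the filenames derived from the URLs are purely illustrative:
import os

if not os.path.isdir('scams'):
    os.makedirs('scams')

for href in hrefs:
    if not href.endswith('.htm'):
        continue  # assumption: the individual e-mail pages end in .htm
    resp = requests.get(href)
    # derive a filename from the last part of the URL path
    filename = href.rstrip('/').split('/')[-1]
    with open(os.path.join('scams', filename), 'wb') as f:
        f.write(resp.content)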
To get all links on the page you could use BeautifulSoup. Take a look at this page; it can help. It actually explains how to do exactly what you need.
To save all pages, you could do the same as what you do in your current code, but within a loop that would iterate over all links you'll have extracted and stored, say, in a list.
Here's a solution using lxml + XPath and urllib2:
#!/usr/bin/env python2 -u
# -*- coding: utf8 -*-
import cookielib, urllib2
from lxml import etree

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
page = opener.open("http://www.419scam.org/emails/")
page.addheaders = [('User-agent', 'Mozilla/5.0')]
reddit = etree.HTML(page.read())

# XPath expression: we get all links under body/p[2] containing *.htm
for node in reddit.xpath('/html/body/p[2]/a[contains(@href,".htm")]'):
    for i in node.items():
        url = 'http://www.419scam.org/emails/' + i[1]
        page = opener.open(url)
        page.addheaders = [('User-agent', 'Mozilla/5.0')]
        lst = url.split('/')
        try:
            if lst[6]:  # else it's a "month" link
                filename = '/tmp/' + url.split('/')[4] + '-' + url.split('/')[5]
                f = open(filename, 'w')
                f.write(page.read())
                f.close()
        except:
            pass

# vim:ts=4:sw=4
You could use HTMLParser from the standard library and specify the type of tag you are searching for.
from HTMLParser import HTMLParser
import urllib2

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    print attr[1]

address = 'http://www.419scam.org/emails/'
html = urllib2.urlopen(address).read()

f = open('test.txt', 'wb')
f.write(html)
f.close()

parser = MyHTMLParser()
parser.feed(html)
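
If you want to collect the links instead of just printing them (so you can then fetch and save each one, as the question asks), a small variation on the same parser might look like this; treating the hrefs as relative to the index page is an assumption about how that page is written:
from HTMLParser import HTMLParser
import urllib2

class LinkCollector(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

address = 'http://www.419scam.org/emails/'
collector = LinkCollector()
collector.feed(urllib2.urlopen(address).read())

for href in collector.links:
    # assumption: hrefs on the index are relative, so prepend the base address
    print address + href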
