Checking the contents of Google search result URLs in a loop - python

# -*- coding: utf-8 -*-
import re
import csv
import urllib
import urllib2
import BeautifulSoup

Filter = [' ab1', ' ab2', ' dc4', ...]
urllists = ['myurl1', 'myurl2', 'myurl3', ...]

csvfile = file('csv_test.csv', 'wb')
writer = csv.writer(csvfile)
writer.writerow(['keyword', 'url'])

for eachUrl in urllists:
    for kword in Filter:
        keyword = "site:" + urllib.quote_plus(eachUrl) + kword
        safeKeyword = urllib.quote_plus(keyword)
        fullQuery = 'http://www.google.com/search?sourceid=chrome&client=ubuntu&channel=cs&ie=UTF-8&q=' + safeKeyword
        req = urllib2.Request(fullQuery, headers={'User-Agent': 'Mozilla/15.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/12.04 Chrome/21.0.118083 Safari/535.11'})
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup.BeautifulSoup(html, fromEncoding='utf8')
        resultURLList = [t.a['href'] for t in soup.findAll('h3', {'class': 'r'})]
        if resultURLList:
            for l in resultURLList:
                needCheckHtml = urllib2.urlopen(l).read()
                if needCheckHtml:
                    x = re.compile(r"\b" + kword + r"\b")
                    p = x.search(needCheckHtml)
                    if p:
                        data = [kword, l]
                        writer.writerow(data)
        else:
            print '%s: No Results' % kword

csvfile.close()
This is a simple script that checks which URLs show up in the Google search results, opens each of them, and matches the keywords from the Filter list using re. The code above can raise errors, for example HTTPError or URLError, but I don't know how to handle them and improve the code. Can someone help me with that? Please.
Also, if Google starts rejecting my requests, I want to use os.system("rasdial name user code") to reconnect the PPPoE connection and change my IP, so how should the code be adapted for that?
Thanks very much!
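For illustration only, here is a minimal sketch (not the asker's final code) of how the two urlopen() calls above could be wrapped in try/except so that an HTTPError or URLError doesn't stop the loop; reconnect() is a hypothetical helper built around the rasdial idea mentioned above.

import os
import time
import urllib2

def reconnect():
    # Hypothetical PPPoE reconnect; "name user code" are placeholders from the question.
    os.system("rasdial name user code")
    time.sleep(10)  # give the new connection a moment to come up

def fetch(url, headers=None, retries=3):
    """Return the page body, or None if it keeps failing."""
    for attempt in range(retries):
        try:
            req = urllib2.Request(url, headers=headers or {})
            return urllib2.urlopen(req, timeout=30).read()
        except urllib2.HTTPError as e:
            if e.code == 503:   # Google often answers 503 when it starts blocking
                reconnect()
            else:
                return None     # 404 and friends: just skip this URL
        except urllib2.URLError:
            time.sleep(5)       # network hiccup: wait and retry
    return None

With a helper like this, html = urllib2.urlopen(req).read() in the script becomes html = fetch(fullQuery, headers={...}), and a None result can simply be skipped.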

I'm not sure how much this helps, but there is a search API that you can use without Google blocking your request and without the need to change your IP address; although there are some restrictions here as well.
http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=AnT4i
{"responseData": {"results":[{"GsearchResultClass":"GwebSearch","unescapedUrl":"http://www.ncbi.nlm.nih.gov/pubmed/11526138","url":"http://www.ncbi.nlm.nih.gov/pubmed/11526138","visibleUrl":"www.ncbi.nlm.nih.gov","cacheUrl":"","title":"Identification of aminoglycoside-modifying enzymes by susceptibility \u003cb\u003e...\u003c/b\u003e","titleNoFormatting":"Identification of aminoglycoside-modifying enzymes by susceptibility ...","content":"In 381 Japanese MRSA isolates, the \u003cb\u003eant(4\u0026#39;)-I\u003c/b\u003e, aac(6\u0026#39;)-aph(2\u0026quot;), and aph(3\u0026#39;)-III genes \u003cb\u003e...\u003c/b\u003e Isolates with only the \u003cb\u003eant(4\u0026#39;)-I\u003c/b\u003e gene had coagulase type II or III, but isolates \u003cb\u003e...\u003c/b\u003e"},{"GsearchResultClass":"GwebSearch","unescapedUrl":"http://www.ncbi.nlm.nih.gov/pubmed/1047990","url":"http://www.ncbi.nlm.nih.gov/pubmed/1047990","visibleUrl":"www.ncbi.nlm.nih.gov","cacheUrl":"","title":"[\u003cb\u003eANT(4\u0026#39;)I\u003c/b\u003e: a new aminoglycoside nucleotidyltransferase found in \u003cb\u003e...\u003c/b\u003e","titleNoFormatting":"[ANT(4\u0026#39;)I: a new aminoglycoside nucleotidyltransferase found in ...","content":"[\u003cb\u003eANT(4\u0026#39;)I\u003c/b\u003e: a new aminoglycoside nucleotidyltransferase found in \u0026quot;staphylococcus aureus\u0026quot; (author\u0026#39;s transl)]. [Article in French]. Le Goffic F, Baca B, Soussy CJ, \u003cb\u003e...\u003c/b\u003e"},{"GsearchResultClass":"GwebSearch","unescapedUrl":"http://jcm.asm.org/content/27/11/2535","url":"http://jcm.asm.org/content/27/11/2535","visibleUrl":"jcm.asm.org","cacheUrl":"","title":"Use of plasmid analysis and determination of aminoglycoside \u003cb\u003e...\u003c/b\u003e","titleNoFormatting":"Use of plasmid analysis and determination of aminoglycoside ...","content":"Aminoglycoside resistance pattern determinations revealed the presence of the \u003cb\u003eANT(4\u0026#39;)-I\u003c/b\u003e enzyme (aminoglycoside 4\u0026#39; adenyltransferase) in all group 1 isolates \u003cb\u003e...\u003c/b\u003e"},{"GsearchResultClass":"GwebSearch","unescapedUrl":"http://ukpmc.ac.uk/articles/PMC88306","url":"http://ukpmc.ac.uk/articles/PMC88306","visibleUrl":"ukpmc.ac.uk","cacheUrl":"","title":"Identification of Aminoglycoside-Modifying Enzymes by \u003cb\u003e...\u003c/b\u003e","titleNoFormatting":"Identification of Aminoglycoside-Modifying Enzymes by ...","content":"The technique used three sets of primers delineating specific DNA fragments of the aph(3\u0026#39;)-III, \u003cb\u003eant(4\u0026#39;)-I\u003c/b\u003e, and aac(6\u0026#39;)-aph(2\u0026quot;) genes, which influence the MICs of \u003cb\u003e...\u003c/b\u003e"}],"cursor":{"resultCount":"342","pages":[{"start":"0","label":1},{"start":"4","label":2},{"start":"8","label":3},{"start":"12","label":4},{"start":"16","label":5},{"start":"20","label":6},{"start":"24","label":7},{"start":"28","label":8}],"estimatedResultCount":"342","currentPageIndex":0,"moreResultsUrl":"http://www.google.com/search?oe\u003dutf8\u0026ie\u003dutf8\u0026source\u003duds\u0026start\u003d0\u0026hl\u003den\u0026q\u003dAnT4i","searchResultTime":"0.25"}}, "responseDetails": null, "responseStatus": 200}
see http://googlesystem.blogspot.hu/2008/04/google-search-rest-api.html
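As a rough, untested sketch of how that endpoint could be called from Python 2 (the URL and the responseData/results field names are taken from the example response above; note that this legacy AJAX API has since been deprecated by Google):

import json
import urllib
import urllib2

def ajax_search(query):
    # Build the same endpoint as in the example above and return the result URLs.
    url = ('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
           + urllib.quote_plus(query))
    data = json.load(urllib2.urlopen(url))
    results = (data.get('responseData') or {}).get('results') or []
    return [r['url'] for r in results]

print ajax_search('site:myurl1 ab1')  # placeholder query from the question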

Related

"AttributeError" in web scraping using python

When I executed that same code on my laptop using Jupyter, I got the following error
AttributeError                Traceback (most recent call last)
      # form cycles)
      excludedPages = filter(isInternalNode, getChildren("http://www.quora.com/directory"))
----> excludedPages.append("http://www.quora.com")
      excludedPages.append("http://www.quora.com#")
      excludedPages.append("http://www.quora.com/")
AttributeError: 'filter' object has no attribute 'append'
The code is here: https://github.com/jessicatysu/quora/blob/master/numfollowers.py
This code was written for Python 2 - you can see print used without (), which only works in Python 2.
But Python 2 has other differences too. In Python 2, filter() creates a list, but in Python 3 filter() is "lazy": it doesn't build the list at once, and in some situations you have to use list() to convert the filter object into a list.
And you have to do that before you use append():
excludedPages = list(filter(...))
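A quick illustration of the difference, run under Python 3:

# Python 3: filter() returns a lazy filter object, not a list.
evens = filter(lambda n: n % 2 == 0, [1, 2, 3, 4])
print(evens)          # <filter object at 0x...>
# evens.append(6)     # would raise: 'filter' object has no attribute 'append'

evens = list(evens)   # materialize it first
evens.append(6)       # now append() works
print(evens)          # [2, 4, 6]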
EDIT:
Here is the code, which runs without error.
In the code you can see comments marked # changed with more information.
But the problem is that this script is 7 years old (from 2013) and Quora has changed its HTML:
the starting page /directory no longer displays the list of users
(probably for security or the GDPR, General Data Protection
Regulation)
it uses JavaScript, but mechanize can't run JavaScript
so the code is useless :)
You would have to use Selenium to control a real web browser which can run JavaScript, and you would have to analyze the web pages again to create new code. (A rough Selenium sketch follows after the code below.)
# Grabs N people randomly from the directory using reservoir sampling, then
# counts the number of followers they have. I never got to run this script
# to completion because Quora blocked the script before I added the rate
# limits.

import mechanize
import random
import http.cookiejar as cookielib  # changed: in Python 3 module `cookielib` was renamed to `http.cookiejar`
import re
from time import sleep

NUM_SAMPLES = 1000
FOLLOWERS_FILE = "followers.txt"
USERS_FILE = "users.txt"
ERR_LOG = "errors.txt"

err = open(ERR_LOG, 'w')

# Randomly chosen Quora users (written in the form of links to Quora
# profiles)
users = []
curUserIdx = 1

# Regular expressions that will be used multiple times
leaf = re.compile("-")  # Separator between first and last names!
internalNode = re.compile("directory/page")
fnum = re.compile("Followers.*>([0-9]+)<.*Following")

# We use this function to open pages instead of br.open to avoid putting a
# high load on Quora's servers. This means the script takes a lot longer
# though - estimated time 1 day for 2 million users. (21400 page accesses
# * 4 seconds per access = 23.8 hours.)
def openPage(site):
    print('[DEBUG] openPage:', site)  # changed: added only for debug
    result = br.open(site)  # changed: added `result =`
    sleep(3)
    return result  # changed: added `return result`

# Gets child links
def getChildren(node):
    try:
        openPage(node)
        print(br.links())
        return ["http://www.quora.com" + link.url for link in br.links()]
    except:
        print("Could not get children of " + node)
        err.write("Could not get children of " + node)
        return []

# Checks to see if the link is a user profile.
def isLeaf(node):
    return leaf.search(node)

# Checks to see if the link is an intermediate node in the directory.
def isInternalNode(node):
    return internalNode.search(node)

# Checks to see if the page is part of the people directory
def inPeopleDirectory(node):
    try:
        page = openPage(node)
        html = page.read()
    except Exception as ex:  # changed: display some info about the problem
        print('ex:', ex)  # changed: display some info about the problem
        print("Could not open site " + node)
        err.write("Could not open site " + node)
        return False
    # --- changed: add decode with try/except ---
    try:
        html = html.decode('utf-8')
    except:
        print("Could not decode HTML using UTF-8 " + node)
        err.write("Could not decode HTML using UTF-8 " + node)
        return False
    # --- changed: end ---
    return "People on Quora" in html

# Applies reservoir sampling to a candidate leaf
def sample(node):
    # curUserIdx is 1-indexed
    global users, curUserIdx
    # Initialize the list
    if (curUserIdx <= NUM_SAMPLES):
        users.append(node)
    # Replace elements
    else:
        # random.randint chooses a random integer, inclusive
        choice = random.randint(1, curUserIdx)
        if (choice <= NUM_SAMPLES):
            users[choice - 1] = node
    curUserIdx += 1

# Gets the number of followers for a user
def getFollowers(profile):
    try:
        page = openPage(profile)
        m = fnum.search(page.read())
        if m:
            return m.group(1)
    except:
        print("Could not get number of followers for " + profile)
        err.write("Could not get number of followers for " + profile)

# Traverses the tree using depth first search.
def crawl(node):
    for child in getChildren(node):
        if child in excludedPages:
            pass
        elif isLeaf(child):
            print("Sampling " + child)
            sample(child)
        elif isInternalNode(child):
            print("Crawling internal node " + child)
            crawl(child)
        else:
            print("Passing on link " + child)

# Initialize browser
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but doesn't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; \
rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

# Get list of top level pages (and exclude them from searches, because they
# form cycles)
excludedPages = list(filter(isInternalNode, getChildren("https://www.quora.com/directory")))  # changed: added `list()`
excludedPages.append("https://www.quora.com")
excludedPages.append("https://www.quora.com#")
excludedPages.append("https://www.quora.com/")
excludedPages.append("https://www.quora.com/about/tos")
print('[DEBUG] excludedPages:', list(excludedPages))  # changed: added only for debug
topPages = filter(inPeopleDirectory, excludedPages)
print('[DEBUG] topPages:', list(topPages))  # changed: added only for debug

# Access Quora directory (it's public access!)
for page in topPages:
    crawl(page)

# Get followers for each user
ff = open(FOLLOWERS_FILE, 'w')
uf = open(USERS_FILE, 'w')

# Write these in two separate steps in case something goes wrong with
# getFollowers. I don't want to lose my random sample, because that is the
# hardest part to get.
for u in users:
    uf.write(u + "\n")
uf.close()

for u in users:
    numFollowers = getFollowers(u)
    if numFollowers:
        ff.write(u + "\t" + getFollowers(u) + "\n")
ff.close()
err.close()
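And since the mechanize version above can no longer see the user list (the page is built with JavaScript), here is a rough, untested Selenium sketch of how a new version could start; the selector and the crude wait are placeholders that would have to be worked out by inspecting Quora's current pages.

# Rough sketch only: open the directory in a real browser so JavaScript runs,
# then collect hrefs the same way the mechanize version filtered br.links().
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()        # needs chromedriver on PATH
try:
    driver.get("https://www.quora.com/directory")
    time.sleep(5)                  # crude wait for the JavaScript-rendered content
    links = driver.find_elements(By.CSS_SELECTOR, "a")
    hrefs = [a.get_attribute("href") for a in links if a.get_attribute("href")]
    print(hrefs[:20])              # these would then be filtered like br.links() was
finally:
    driver.quit()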

Using data from one request for second url request in same script python

What I have so far is a first request gathering the IDs. I would then like to use each returned draftgroupid in the second URL request. Is it possible to send two requests in the same script, and if so, how would I do a for i in range(draftgroupid) in the second URL request?
import requests
import json

req = requests.get(url="https://www.draftkings.com/lobby/getcontests?sport=NHL")
req.raise_for_status()
data = req.json()

for i, contest in enumerate(data['DraftGroups']):
    draftgroupid = contest['DraftGroupId']
Output of draftgroupid:
16901
16905
16902
16903
req2 = requests.get(url="https://api.draftkings.com/draftgroups/v1/draftgroups/THEVALUEIWANTTOLOOPTHROUGH/draftables?format=json")
EDIT
import csv
import requests
import json

req = requests.get(url="https://www.draftkings.com/lobby/getcontests?sport=NHL")
req.raise_for_status()
data = req.json()

for i, contest in enumerate(data['DraftGroups']):
    draftgroupid = contest['DraftGroupId']
    req2 = requests.get(url="https://api.draftkings.com/draftgroups/v1/draftgroups/" + str(draftgroupid) + "/draftables?format=json")
    data2 = req2.json
    for i, player_info in enumerate(data2['draftables'][0]):
        date = player_info['competition']['startTime']
        print(date)
Running into a TypeError: 'method' object is not subscriptable
As I understand it, your problem is related to string manipulation rather than to the requests library.
So basically,
import requests
import json

req = requests.get(url="https://www.draftkings.com/lobby/getcontests?sport=NHL")
req.raise_for_status()
data = req.json()

for i, contest in enumerate(data['DraftGroups']):
    draftgroupid = contest['DraftGroupId']
    requests.get(url="https://api.draftkings.com/draftgroups/v1/draftgroups/" + str(draftgroupid) + "/draftables?format=json")
should do the job.
More elegant ways to concatenate strings can be found at http://www.pythonforbeginners.com/concatenation/string-concatenation-and-formatting-in-python
Edit
For example,
"some string " + str(123)
"some string %d" % 123
"some string %s" % 123
Will all give the same output. There are more ways to concatenate strings. You just need to choose the best fit based on the context.
for i, contest in enumerate(data['DraftGroups']):
    draftgroupid = contest['DraftGroupId']
    req2 = requests.get(url="https://api.draftkings.com/draftgroups/v1/draftgroups/%d/draftables?format=json" % draftgroupid)
I assume you didn't actually mean for i in range(draftgroupid) as you stated in the question, because that would mean making 16901 requests, followed by 16905 requests (all of which except the last four would be duplicates of the first batch), followed by 16902 requests (of which all would be duplicates), etc.
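As a side note, the TypeError in the edit comes from data2 = req2.json, which stores the bound method instead of calling it. A sketch of the edited loop with that call fixed (same endpoints as in the question; the exact layout of 'draftables' is assumed from the field accesses shown above):

import requests

req = requests.get("https://www.draftkings.com/lobby/getcontests?sport=NHL")
req.raise_for_status()
data = req.json()

for contest in data['DraftGroups']:
    draftgroupid = contest['DraftGroupId']
    req2 = requests.get("https://api.draftkings.com/draftgroups/v1/draftgroups/%d/draftables?format=json"
                        % draftgroupid)
    req2.raise_for_status()
    data2 = req2.json()                      # note the (): call the method to get the parsed JSON
    for player_info in data2['draftables']:  # assumed to be a list of player entries
        print(player_info['competition']['startTime'])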

Correcting to the correct URL

I have written a simple script that reads keywords from a file and uses them to build the URL for a JSON request.
Below is the script that I have written:
import urllib2
import json

f1 = open('CatList.text', 'r')
f2 = open('SubList.text', 'w')

lines = f1.read().splitlines()

for line in lines:
    url = 'https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=' + line + '&cmlimit=100'
    json_obj = urllib2.urlopen(url)
    data = json.load(json_obj)
    for item in data['query']:
        for i in data['query']['categorymembers']:
            print i['title']
            print '-----------------------------------------'
            f2.write((i['title']).encode('utf8') + "\n")
In this script, the program first reads CatList, which provides the list of keywords used in the URL.
Here is a sample of what CatList.text contains.
Category:Branches of geography
Category:Geography by place
Category:Geography awards and competitions
Category:Geography conferences
Category:Geography education
Category:Environmental studies
Category:Exploration
Category:Geocodes
Category:Geographers
Category:Geographical zones
Category:Geopolitical corridors
Category:History of geography
Category:Land systems
Category:Landscape
Category:Geography-related lists
Category:Lists of countries by geography
Category:Navigation
Category:Geography organizations
Category:Places
Category:Geographical regions
Category:Surveying
Category:Geographical technology
Category:Geography terminology
Category:Works about geography
Category:Geographic images
Category:Geography stubs
My program gets the keywords and places them in the URL.
However, I am not able to get the results. I have checked the code by printing the URL:
import urllib2
import json

f1 = open('CatList.text', 'r')
f2 = open('SubList2.text', 'w')

lines = f1.read().splitlines()

for line in lines:
    url = 'https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=' + line + '&cmlimit=100'
    json_obj = urllib2.urlopen(url)
    data = json.load(json_obj)
    f2.write(url + '\n')
The result I get in SubList2 is as follows:
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Branches of geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography by place&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography awards and competitions&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography conferences&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography education&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Environmental studies&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Exploration&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geocodes&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geographers&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geographical zones&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geopolitical corridors&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:History of geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Land systems&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Landscape&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography-related lists&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Lists of countries by geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Navigation&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography organizations&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Places&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geographical regions&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Surveying&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geographical technology&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography terminology&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Works about geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geographic images&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Geography stubs&cmlimit=100
It shows that the URLs are built correctly.
But when I run the full code, it is not able to get the correct results.
One thing I notice is that when I paste a link into the address bar, for example:
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Branches of geography&cmlimit=100
it gives the correct result, because the address bar auto-corrects it to:
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Branches%20of%20geography&cmlimit=100
I believe that if %20 is used in place of the spaces in "Category:Branches of geography", my script will be able to get the correct JSON items.
Problem:
I am not sure how to modify the above code so that the blank spaces contained in CatList are replaced with %20.
Please forgive me for the bad formatting and the long post, I am still trying to learn Python.
Thank you for helping me.
Edit:
Thank you Tim. Your solution works:
url ='https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle='+urllib2.quote(line)+'&cmlimit=100'
It was able to print the correct result:
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3ABranches%20of%20geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20by%20place&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20awards%20and%20competitions&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20conferences&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20education&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AEnvironmental%20studies&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AExploration&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeocodes&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeographers&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeographical%20zones&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeopolitical%20corridors&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AHistory%20of%20geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3ALand%20systems&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3ALandscape&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography-related%20lists&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3ALists%20of%20countries%20by%20geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3ANavigation&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20organizations&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3APlaces&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeographical%20regions&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3ASurveying&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeographical%20technology&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20terminology&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AWorks%20about%20geography&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeographic%20images&cmlimit=100
https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AGeography%20stubs&cmlimit=100
Use urllib.quote() to percent-encode special characters in a URL:
Python 2:
import urllib
line = 'Category:Branches of geography'
url ='https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=' + urllib.quote(line) + '&cmlimit=100'
https://docs.python.org/2/library/urllib.html#urllib.quote
Python 3:
import urllib.parse
line = 'Category:Branches of geography'
url ='https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=' + urllib.parse.quote(line) + '&cmlimit=100'
https://docs.python.org/3.5/library/urllib.parse.html#urllib.parse.quote
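For completeness, a small untested sketch of the whole loop under Python 3, where urllib2 no longer exists; it reuses the CatList.text / SubList.text file names from the question:

import json
import urllib.parse
import urllib.request

with open('CatList.text', 'r') as f1, open('SubList.text', 'w', encoding='utf8') as f2:
    for line in f1.read().splitlines():
        # quote() turns "Category:Branches of geography" into "Category%3ABranches%20of%20geography"
        url = ('https://en.wikipedia.org/w/api.php?action=query&format=json'
               '&list=categorymembers&cmtitle=' + urllib.parse.quote(line)
               + '&cmlimit=100')
        with urllib.request.urlopen(url) as resp:
            data = json.loads(resp.read().decode('utf8'))
        for member in data['query']['categorymembers']:
            print(member['title'])
            f2.write(member['title'] + "\n")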

KeyError and TypeError in my python web scraper

Sorry about the vague and confusing title, but there is really no better way for me to summarize my problem in one sentence.
I am trying to get student and grade information from a French website. The link is this: http://www.bankexam.fr/resultat/2014/BACCALAUREAT/AMIENS?filiere=BACS
My code is as follows:
import time
import urllib2
from bs4 import BeautifulSoup
regions = {'R\xc3\xa9sultats Bac Amiens 2014':'/resultat/2014/BACCALAUREAT/AMIENS'}
base_url = 'http://www.bankexam.fr'
tests = {'es':'?filiere=BACES','s':'?filiere=BACS','l':'?filiere=BACL'}
for i in regions:
    for x in tests:
        # create the output file
        output_file = open('/Users/student project/'+ i + '_' + x + '.txt','a')
        time.sleep(2) #compassionate scraping
        section_url = base_url + regions[i] + tests[x] #now goes to the x test page of region i
        request = urllib2.Request(section_url)
        response = urllib2.urlopen(request)
        soup = BeautifulSoup(response,'html.parser')
        content = soup.find('div',id='zone_res')
        for row in content.find_all('tr'):
            if row.td:
                student = row.find_all('td')
                name = student[0].strong.string.encode('utf8').strip()
                try:
                    school = student[1].strong.string.encode('utf8')
                except AttributeError:
                    school = 'NA'
                result = student[2].span.string.encode('utf8')
                output_file.write('%s|%s|%s\n' % (name,school,result))
        # Find the maximum pages to go through
        if soup.find('div','pagination'):
            import re
            page_info = soup.find('div','pagination')
            pages = []
            for i in page_info.find_all('a',re.compile('elt')):
                try:
                    pages.append(int(i.string.encode('utf8')))
                except ValueError:
                    continue
            max_page = max(pages)
        # Now goes through page 2 to max page
        for i in range(1,max_page):
            page_url = '&p='+str(i)+'#anchor'
            section2_url = section_url+page_url
            request = urllib2.Request(section2_url)
            response = urllib2.urlopen(request)
            soup = BeautifulSoup(response,'html.parser')
            content = soup.find('div',id='zone_res')
            for row in content.find_all('tr'):
                if row.td:
                    student = row.find_all('td')
                    name = student[0].strong.string.encode('utf8').strip()
                    try:
                        school = student[1].strong.string.encode('utf8')
                    except AttributeError:
                        school = 'NA'
                    result = student[2].span.string.encode('utf8')
                    output_file.write('%s|%s|%s\n' % (name,school,result))
A little more description about the code:
I created the 'regions' dictionary and the 'tests' dictionary because there are 30 other regions I need to collect, and I just include one here as a showcase. I'm only interested in the results of three tests (ES, S, L), which is why I created the 'tests' dictionary.
Two errors keep showing up. One is
KeyError: 2
which is linked to line 12,
section_url = base_url + regions[i] + tests[x]
The other is
TypeError: cannot concatenate 'str' and 'int' objects
which is linked to line 10.
I know there is a lot of information here and I'm probably not listing the most important info for you to help me, but let me know what I can do to fix this!
Thanks
The issue is that you're using the variable i in more than one place.
Near the top of the file, you do:
for i in regions:
So, in some places i is expected to be a key into the regions dictionary.
The trouble comes when you use it again later. You do so in two places:
for i in page_info.find_all('a',re.compile('elt')):
And:
for i in range(1,max_page):
The second of these is what is causing your exceptions, as the integer values that get assigned to i don't appear in the regions dict (nor can an integer be added to a string).
I suggest renaming some or all of those variables. Give them meaningful names, if possible (i is perhaps acceptable for an "index" variable, but I'd avoid using it for anything else unless you're code golfing).
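A minimal, self-contained illustration of the clash and of the renaming fix (the names region_name, test_name and page_num are suggestions, not from the original script):

regions = {'Amiens': '/resultat/2014/BACCALAUREAT/AMIENS'}
tests = {'es': '?filiere=BACES', 's': '?filiere=BACS'}

# Problem: the page loop reuses i, so regions[i] fails on the next test.
try:
    for i in regions:
        for x in tests:
            print(regions[i] + tests[x])   # works the first time through...
            for i in range(1, 4):          # ...then this loop clobbers i with an int
                pass
except KeyError as e:
    print('KeyError: %s' % e)              # KeyError: 3, the same kind of error as in the question

# Fix: give every loop its own descriptive name.
for region_name in regions:
    for test_name in tests:
        for page_num in range(1, 4):
            pass
        print(regions[region_name] + tests[test_name])   # region_name is never overwritten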

How can I check the value of a DNS TXT record for a host?

I'm looking to verify domain ownership via a script, specifically a Python script, and would like to know how to look up the value of a DNS TXT record. I know there are services and websites out there for this, but I would like to do it with a script.
This is easy using dnspython. Here is an example:
import dns.resolver
print(dns.resolver.resolve("aaa.asdflkjsadf.notatallsuspicio.us", "TXT").response.answer[0][-1].strings[0])
This gives the following output:
PnCcKpPiGlLfApDbDoEcBbPjIfBnLpFaAaObAaAaMhNgNbIfPbHkMiEfPpGgJfOcPnLdDjBeHkOjFjIbPbIoKhIjHfJlAhAhFgGbGgNlMgKmFkLgNfBjMbCoBeNbGeOnAeHgLmKoFlLhLmDcKlEdEbDpFeHkFaBlGnHiOnChIoMlIhBgOnFfKoEhDnFkKfDaMgHbJhMgPgMjGiAoJpKjKkPaIcAdGiMbIbBbAfEiKjNbCeFoElKgOePmGjJaImL
Another option is to use dig in subprocess:
import subprocess
print subprocess.Popen(["dig","-t","txt","aaa.asdflkjsadf.notatallsuspicio.us","+short"], stdout=subprocess.PIPE).communicate()[0]
This may be overly simplified, but if all you want is a quick read of the TXT record and don't mind dealing with parsing the result separately:
nslookup -q=txt somedomain.com
I found this did what I needed, short & sweet.
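If you want to call that from Python rather than by hand, a small sketch (assuming nslookup is available on the PATH) could look like this:

import re
import subprocess

def txt_records(domain):
    # nslookup prints TXT values in double quotes; keep just those strings.
    out = subprocess.run(["nslookup", "-q=txt", domain],
                         capture_output=True, text=True).stdout
    return re.findall(r'"([^"]*)"', out)

print(txt_records("example.com"))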
Here is another way to get a list of all TXT records for a domain using dnspython.
import dns.resolver
[dns_record.to_text() for dns_record in dns.resolver.resolve("your-domain-here", "TXT").rrset]
update 2022/11/20
# -*- coding:utf-8 -*-
# Copyright (c) DadouLab.SIG MIT
import dns
import dns.query
import dns.resolver
import logging

logger = logging.getLogger(__name__)

class Digger(object):
    def __init__(self, resolvers=["1.1.1.1"]):
        self.mResolver = dns.resolver.Resolver()
        self.mResolver.timeout = 1
        self.mResolver.lifetime = 0.5
        self.mResolver.nameservers = resolvers
        self.spec_query_type = ['CNAME', 'TXT', 'MX', 'NS', 'SRV', 'CAA']

    def query(self, domain, query_type="A"):
        """
        answer = dns.resolver.resolve("_dnsauth.test.com", "TXT").rrset
        for dns_record in answer:
            print(dns_record.to_text())
        """
        try:
            query_type = query_type.upper()
            answer = self.mResolver.resolve(domain, query_type, raise_on_no_answer=False)
            answer_raw = answer.chaining_result.answer.to_text()
            logger.info("resolved response data => {}".format(answer_raw))
            if query_type in self.spec_query_type:
                records = [data.to_text() for data in answer]
            else:
                records = [data.address for data in answer]
            return records
        except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer,
                dns.resolver.NoNameservers, dns.exception.Timeout) as error:
            logger.warning("resolved error => {}".format(error))
            return

    def is_valid(self, domain, query_type="A"):
        try:
            self.mResolver.resolve(domain, query_type, raise_on_no_answer=False)
            return True
        except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer,
                dns.resolver.NoNameservers, dns.exception.Timeout) as error:
            logger.warning("resolved error => {}".format(error))
            return

if __name__ == '__main__':
    dig = Digger()
    print(dig.query("www.example.com", query_type="A"))
Something like this should work to at least get the value for the URL; I used google.com for the example.
import pycurl
import StringIO

url = "whatsmyip.us/dns_txt.php?host=google.com"

c = pycurl.Curl()
c.setopt(pycurl.URL, url)
c.setopt(pycurl.HTTPHEADER, ["Accept:"])
txtcurl = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, txtcurl.write)
c.perform()

data = txtcurl.getvalue()
data = data.replace("Done!", "")
print data
I did not test any of this but pulled it from a previous project.
Best of luck!
