How to properly compare 2 JSON request response strings in Python

How to properly compare 2 JSON request response strings in Python - python

I want to compare 2 Python response strings and print out the differences, here is my code right now:
import requests
import json
import time
getFirst = requests.get("https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1")
liveRN = json.dumps(getFirst.json(), indent=4)
while True:
get = requests.get("https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1")
dataPretty = json.dumps(get.json(), indent=4)
data = get.json()
if get.status_code == 200:
print("ok")
if dataPretty != data:
for item in data:
if str(item) not in liveRN:
send = 1
print(f"Found difference: {item}")
symbol = item['symbol']
img = item['image']
name = item['name']
description = item['description']
print(symbol)
print(img)
print(name)
else:
print("Didnt find")
else:
print("No change")
time.sleep(15)
I only want to print the items when the two responses don't match, but right now it's printing the items even when the strings do match.
I tried adding another if condition so that, when the two request responses match, it does nothing and just passes, but that didn't work.

You can use sets to find whether items of the dictionary are changed or not. I've used the compare code from another question but this is somewhat what you can use for your problem
import requests
import time
def dict_compare(d1, d2):
    """Compare two dictionaries key by key.

    Returns a 4-tuple ``(added, removed, modified, same)``:

    * ``added``    -- set of keys present only in *d1*
    * ``removed``  -- set of keys present only in *d2*
    * ``modified`` -- dict mapping each shared key whose values differ to
      the pair ``(d1[key], d2[key])``
    * ``same``     -- set of shared keys whose values compare equal
    """
    keys_1 = set(d1)
    keys_2 = set(d2)
    only_in_first = keys_1.difference(keys_2)
    only_in_second = keys_2.difference(keys_1)

    modified = {}
    same = set()
    for key in keys_1 & keys_2:
        left, right = d1[key], d2[key]
        # ``!=`` and ``==`` are evaluated independently on purpose, so that
        # values with unusual comparison semantics are classified the same
        # way as by two separate filters.
        if left != right:
            modified[key] = (left, right)
        if left == right:
            same.add(key)
    return only_in_first, only_in_second, modified, same
COLLECTIONS_URL = "https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1"

# Baseline snapshot: the first collection returned when the script starts.
# Every later response is compared against this one.
first = requests.get(COLLECTIONS_URL).json()[0]
while True:
    get_second = requests.get(COLLECTIONS_URL)
    if get_second.status_code == 200:
        print("ok")
        second = get_second.json()[0]
        # dict_compare(first, second) returns the keys found only in the
        # baseline first and the keys found only in the new response second.
        # Relative to the baseline those are the *removed* and the *added*
        # keys respectively, so unpack them in that order.
        # ``modified`` maps key -> (old value, new value).
        removed, added, modified, same = dict_compare(first, second)
        if added or modified or removed:
            print("added: ", added)
            print("modified: ", modified)
            print("removed: ", removed)
        else:
            print("No change")
    # Poll every 15 seconds, whether or not the request succeeded.
    time.sleep(15)

Related

Python implementation with github

How can I fix the following bug: the comments on commits that have been merged are not downloaded? I think the problem is related to REST versus GraphQL, but I don't know exactly how to fix it.
The following script is used for:
Get all quantified activities from a set of GitHub repositories. The
list of repos to be searched for are found in the repos.lst file.
Authentication to GitHub is also stored in separate file called github.token
containing the username and password/access token on two separate lines.
Quantified activities include merged pull-requests, closed issues (except for
those explicitly removed from the list for not being constructive) and comments.
Pull-requests:
Only the closed pull-requests are listed, and their merge status determined,
finally the merged pull-requests are stored in a JSON file, with entries
containing the reference number, the repository, the title, the author and the
pull-request URL.
Issues:
Only the closed issues are listed, the pull-requests (which are treated also as
issues by the GitHub API) removed from them, issues blacklisted in the
blacklist.lst file are also removed from the list, finally the remaining
issues are stored in a JSON file, with entries containing the reference number,
the repository, the title, the author and the issue URL.
Comments:
Comments from the commits, from the issues and from the pull-requests are all
listed and stored in JSON file with entries containing the author, the comment
ID, the repository, the comment content and the comment's URL. Issues comments
and pull-request comments will also contain the reference number of the issue
respectively the pull-request, with issues additionally having also the
original author of the issue, while the commit comments will contain the SHA1 of
the commit.
#!/usr/bin/env python3
# requests library is used to access GitHub API
import requests
import json
import re
import sys
import os.path
import bisect
import getpass
from itertools import chain
import repos
year = repos.get_year()
path = '../../{}/rezultate/'.format(year)
# read the list of repositories: one "owner/name" entry per line
repo_name = re.compile(r'([\w-]+)/([\w-]+)')
# NOTE: from here on ``repos`` is the list of (owner, name) tuples and no
# longer the ``repos`` module imported above.
repos = [('etc-so', 'syllabus')]
with open(path + 'data/repos.lst', 'r') as f:
    for s in f:
        s = s.strip()
        # ignore empty lines and comment lines
        if not s or s.startswith('#'):
            continue
        m = repo_name.fullmatch(s)
        if not m:
            print("Invalid repo name: " + s)
            # skip the bad entry; calling m.group() on None would crash
            continue
        repos.append((m.group(1), m.group(2)))
# read the list of students
students = []
with open(path + 'data/students.json', 'r') as f:
students = [x['user'] for x in json.load(f)]
if not students:
print("No students to check for")
sys.exit(1)
# get the access token
if os.path.exists('github.token'):
with open('github.token', 'r') as f:
auth = (f.readline().strip(), f.readline().strip())
else:
auth = (input("Enter GitHub username: "), getpass.getpass('Password: '))
# construct a labels list, so it can be added to the comments as well
issue_labels = {}
# get the persistent PR, issue and comment lists
if os.path.exists(path + 'data/pulls.json'):
with open(path + 'data/pulls.json', 'r') as f:
saved_pulls = dict((x['id'],(x['week'], x['value'], x['author']) + ((x['multi'],) if 'multi' in x else ())) for x in json.load(f))
else:
saved_pulls = {}
if os.path.exists(path + 'data/issues.json'):
with open(path + 'data/issues.json', 'r') as f:
saved_issues = dict((x['id'],(x['week'], x['value'])) for x in json.load(f))
else:
saved_issues = {}
if os.path.exists(path + 'data/comments.json'):
with open(path + 'data/comments.json', 'r') as f:
saved_comments = dict((x['id'],(x['week'], x['value'])) for x in json.load(f))
else:
saved_comments = {}
current_week = 0
# if there were already some activity then move ahead to the last week
if saved_pulls or saved_issues or saved_comments:
current_week = max(x[0] for x in chain(saved_pulls.values(),
saved_issues.values(),
saved_comments.values()))
# if there is a command line argument use it as current week
if len(sys.argv) == 2:
if sys.argv[1].isdigit():
current_week = int(sys.argv[1])
# -n increments the current week if it can be obtained from the activity
elif sys.argv[1] == '-n' and current_week != 0:
current_week += 1
print("Switching to week {}".format(current_week))
elif sys.argv[1] == '?':
print("Current week is {}".format(current_week))
sys.exit(0)
else:
print("Invalid command line parameter")
sys.exit(1)
elif len(sys.argv) > 2:
print("Too many parameters")
sys.exit(1)
# if no current week was obtained, start with week 1
if not current_week:
current_week = 1
api_url = "https://api.github.com/"
get_url = "repos/{}/{}/{}"
c_params = {
'state': 'closed', # get closed pull-requests/issues
'sort': 'created',
'direction': 'asc', # sort it in ascending order by their creation time
}
pr = []
pr_times = []
gi = []
comments = []
c_times = []
authors = {}
issue_comment_re = re.compile(r'.*/(pull|issues)/([0-9]+)#.*')
link_rel = re.compile(r'<([^>]+)>; rel="([\w]+)"(?:, )?')
request_count = 0
def github_get(get_code, **params):
    """Fetch every page of a GitHub REST listing and return the merged list.

    The first URL is built from the module-level ``api_url`` and ``get_url``
    together with the current ``org`` and ``repo`` globals and *get_code*
    (for example ``'issues'`` or ``'pulls/comments'``). *params* are sent as
    query parameters. Further pages are found through the ``rel="next"``
    entry of the ``Link`` response header.

    Any response whose status is not 200 is printed and the process exits
    with status 1.

    Side effect: the global ``request_count`` is set to the
    ``X-RateLimit-Remaining`` header of the last response.
    """
    global request_count
    res = []
    url = api_url + get_url.format(org, repo, get_code)
    while url:
        r = requests.get(url, auth=auth, params=params)
        if r.status_code != 200:
            print('Status code {} received'.format(r.status_code))
            print(r.content)
            sys.exit(1)
        links = dict((m[1], m[0]) for m in link_rel.findall(r.headers.get('link', '')))
        url = links.get('next')
        # The "next" link already carries the full query string, so the
        # parameters must not be appended a second time.
        params = None
        res.extend(r.json())
        request_count = r.headers['X-RateLimit-Remaining']
    return res
# Score attached to each predefined label name.
label_values = {
    'bug': 2,
    'enhancement': 2,
    'challenge': 2,
    'help wanted': 1,
    'invalid': -1,
    'duplicate': -2,
}


def label_value(labels):
    """Compute the score of a list of label names.

    Candidate scores are the predefined values of the labels found in
    ``label_values`` plus, for every label, the number of black stars in its
    name. If any candidate is negative the lowest one is returned as is.
    Otherwise the result is the highest candidate (0 when there are none)
    plus the total number of white stars in all label names.
    """
    scores = []
    for name in labels:
        if name in label_values:
            scores.append(label_values[name])
    # every label contributes its black-star count, even when that is 0
    scores.extend(name.count('★') for name in labels)

    if not scores:
        base = 0
    else:
        lowest = min(scores)
        if lowest < 0:
            return lowest
        base = max(scores)

    # add all white stars
    white_stars = 0
    for name in labels:
        white_stars += name.count('☆')
    return base + white_stars
def issue_value(value, issue_id=None):
    """Return *value* unchanged, reporting a change against the saved issue.

    When *issue_id* is given and ``saved_issues`` holds an entry for it whose
    stored value (index 1) differs from *value*, a notice is printed.
    """
    if not issue_id:
        return value
    previous = saved_issues.get(issue_id)
    if previous and previous[1] != value:
        print("Value change detected in issue {}".format(issue_id))
    return value
def pull_value(label, pull_id=None, ref=0):
    """Return the score of a pull-request from its labels.

    With no labels the score is ``None`` and a notice naming *ref* is
    printed. When *pull_id* is given and ``saved_pulls`` holds an entry for
    it whose stored score (index 1) is not ``None`` and differs from the new
    score, the change is reported; if the new score is ``None`` the stored
    score is returned instead.
    """
    if not label:
        value = None
        print("No label for pull-request {}".format(ref))
    else:
        value = label_value(label)

    if not pull_id:
        return value
    saved = saved_pulls.get(pull_id)
    if not saved:
        return value

    saved_score = saved[1]
    if saved_score is None or saved_score == value:
        return value
    print("Value change detected in pull-request {}".format(ref))
    if value is None:
        # no label now: keep the score recorded earlier
        value = saved_score
    return value
ptotal = 0
itotal = 0
ctotal = 0
# pass through the repos
for org,repo in repos:
print('Processing repo {}:'.format(repo), end=' ', flush=True)
# get all the issues, do this first as it all includes the pull-requests
# for which only here we can determine the labels
issues = github_get('issues', state='all', sort='created', direction='asc')
icount = 0;
for i in issues:
ref = int(i['number'])
author = i['user']['login']
authors[(repo, ref)] = author
label = [x['name'] for x in i['labels']]
issue_labels[(repo, ref)] = label
if 'pull_request' not in i and author in students:
gi.append(
{
'id': i['id'],
'repo': repo,
'ref': ref,
'title': i['title'],
'url': i['html_url'],
'author': author,
'label': label,
'week' : saved_issues.get(i['id'], [current_week])[0],
'value' : issue_value(label_value(label), i['id'])
})
icount += 1
# get the merged pull-requests
pulls = github_get('pulls', state='closed')
pcount = 0;
#print(r.headers)
for p in pulls:
ref = int(p['number'])
author = p['user']['login']
label = issue_labels.get((repo, ref), [])
if p['merged_at'] and author in students and (not label or label_value(label) >= 0):
i = bisect.bisect_right(pr_times, p['merged_at'])
pr_times.insert(i, p['merged_at'])
# check authors of included commits
if p['id'] in saved_pulls:
author = saved_pulls[p['id']][2]
else:
pcommits = github_get('pulls/{}/commits'.format(ref))
pc_authors = [author]
for a in pcommits:
al = a['author'].get('login') if a['author'] else None
cl = a['committer'].get('login') if a['committer'] else None
if al == cl or cl == 'web-flow' or cl == 'kcs':
aa = al
else:
aa = ':'.join(x for x in [al, cl] if x)
if aa and aa not in pc_authors and aa != 'kcs':
pc_authors.append(aa)
if len(pc_authors) != 1:
author = pc_authors
pr.insert(i, {
'id': p['id'],
'repo': repo,
'ref': ref,
'title': p['title'],
'url': p['html_url'],
'label': label,
'author': author,
'week': saved_pulls.get(p['id'], [current_week])[0],
'value': pull_value(label, p['id'], ref)
})
if len(saved_pulls.get(p['id'], [])) > 3:
pr[i]['multi'] = saved_pulls[p['id']][3]
pcount += 1
# and now for the comments:
# this is more troublesome as constructive comments must be selected
# manually, so we are keeping persistent JSON file for the comments
# holding a valid tag, newly downloaded comments have this tag unset
# and they can be validated afterwards manually (or by script)
ccount = 0;
# 3 types of comments exist on GitHub: issue comments, review comments
# and commit comments, all have to be handled separately
ccomments = github_get('comments', per_page=50, sort='created')
#with open('all_ccomments.json', 'w') as f:
# json.dump(ccomments, f, indent=4, sort_keys=True)
for c in ccomments:
author = c['user']['login']
if author in students:
i = bisect.bisect_right(c_times, c['created_at'])
c_times.insert(i, c['created_at'])
comments.insert(i,
{
'id': c['id'],
'repo': repo,
'commit': c['commit_id'],
'msg': c['body'],
'url': c['html_url'],
'author': author,
'week': saved_comments.get(c['id'], [current_week])[0],
'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
})
ccount += 1
icomments = github_get('issues/comments', per_page=50, sort='created')
for c in icomments:
author = c['user']['login']
if author in students:
url = c['html_url']
m = issue_comment_re.fullmatch(url)
if not m:
print("Problem parsing issue url " + url)
sys.exit(1)
ref = int(m.group(2))
i = bisect.bisect_right(c_times, c['created_at'])
c_times.insert(i, c['created_at'])
comments.insert(i,
{
'id': c['id'],
'repo': repo,
'issue': ref,
'msg': c['body'],
'url': url,
'author': author,
'issue_author': authors[(repo, ref)],
'week': saved_comments.get(c['id'], [current_week])[0],
'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
})
if m.group(1) == 'issues' and (repo, ref) in issue_labels:
comments[i]['label'] = issue_labels[(repo, ref)]
ccount += 1
pcomments = github_get('pulls/comments', per_page=50, sort='created')
for c in pcomments:
author = c['user']['login']
if author in students:
ref = int(c['pull_request_url'].rsplit('/', 1)[1])
i = bisect.bisect_right(c_times, c['created_at'])
c_times.insert(i, c['created_at'])
comments.insert(i,
{
'id': c['id'],
'repo': repo,
'pull': ref,
'msg': c['body'],
'url': c['html_url'],
'author': author,
'week': saved_comments.get(c['id'], [current_week])[0],
'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
})
ccount += 1
print('found {} merged pull-requests, {} issues and {} comments'.format(pcount, icount, ccount))
ptotal += pcount
itotal += icount
ctotal += ccount
with open(path + 'data/pulls.json', 'w') as f:
json.dump(pr, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/issues.json', 'w') as f:
json.dump(gi, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/comments.json', 'w') as f:
json.dump(comments, f, indent=4, sort_keys=True, ensure_ascii=False)
print("Total: {} merged pull-requests, {} issues and {} comments [{} contributions]".format(
ptotal, itotal, ctotal, ptotal + itotal + ctotal))
print("Remaining request count: {}".format(request_count))
repo_key = dict((r[1],i) for i,r in enumerate(repos))
def sort_repos(x):
    '''Sort key for blacklist entries of the form (repo, ref).

    Entries whose repo appears in the repos list are ordered by the position
    of that repo in the list and then by ref. Entries for any other repo are
    placed after all of them.
    '''
    repo = x[0]
    if repo not in repo_key:
        # unknown repo: rank it behind every known one, keep the whole entry
        return (len(repos),) + x
    return (repo_key[repo], x[1])

Searching for a filename in a folder

I have a list of ts files inside a folder. I try to extract the content id from the XML which is the filename without extension. I need to search for a ts file that matches the content id. For some reason, it's failing. I am attaching the code below. I am also attaching the screenshot for the ts files.
import glob
import lxml.etree as et
import os, csv
ASSET_METADATA_PATH = '/Users/roradhak/eVision/failed_assets/'
TS_PATH = '/Users/roradhak/eVision/ts_check/'
def parse_file(path):
    """Extract the content metadata from one asset XML file.

    Only the first ``Program`` element titled "Program" is read.

    Returns the 4-tuple ``(content_id, content_name, has_trailer,
    trailer_id)``. A property that is missing from the file is returned as
    ``None``. ``trailer_id`` is the content ID with its first "M" replaced by
    "T" when ``Has_Trailer`` is "Y", and the empty string otherwise. The same
    4-tuple shape is returned when the file has no matching program, so
    callers can always unpack four values.
    """
    tree = et.parse(path)
    root = tree.getroot()
    # XPath attribute tests are written with "@" ("#" is not valid XPath).
    programs = root.xpath('Program[@title="Program"]')
    if not programs:
        return None, None, None, ""
    program = programs[0]  # TODO - Are multiple programs expected? If so, the function should return a list of tuples

    def prop_text(title):
        # Text of the first property with the given title, or None if absent.
        nodes = program.xpath('props/*[@title="{}"]'.format(title))
        return nodes[0].text if nodes else None

    content_id = prop_text("Content ID")
    has_trailer = prop_text("Has_Trailer")
    content_name = prop_text("Name")

    trailer_id = ""
    # A trailer ID can only be derived when the content ID is known.
    if has_trailer == "Y" and content_id:
        trailer_id = content_id.replace('M', 'T', 1)
    return content_id, content_name, has_trailer, trailer_id
def main():
asset_metadata = glob.glob(os.path.join(ASSET_METADATA_PATH, u'*.xml'))
movies = glob.glob(os.path.join(TS_PATH, u'*.ts'))
for p in asset_metadata:
print(u'Processing: {p}'.format(p=p).encode('utf-8'))
print content_id, content_name, has_trailer, trailer_id
content_id, content_name, has_trailer, trailer_id= parse_file(p)
if u'{c}.ts'.format(c=content_id) not in TS_PATH:
print "No Movie"
if has_trailer =="Y":
if u'{c}.ts'.format(c=trailer_id) not in movies:
print "No trailer"
if __name__ == '__main__':
main()
Output as below
/Users/roradhak/IVPGET_Local/venv/bin/python /Users/roradhak/Downloads/validate_xml.py
Processing: /Users/roradhak/eVision/failed_assets/E30000001557115265_2019_08_29T11_20_08Z.xml
MD009232 Ep 143 - Cool look Hair style N
No Movie
Processing: /Users/roradhak/eVision/failed_assets/10000000717960000_2019_10_09T15_04_20Z.xml
MZ008931 Aan: Men At Work Y TZ008931
No Movie
No trailer
Processing: /Users/roradhak/eVision/failed_assets/E30000001557537308_2019_08_09T19_15_22Z.xml
MZ010564 EP29 - Episode 29 - Raheem S1 Y TZ010564
No Movie
No trailer
Process finished with exit code 0

Here is how I would do it with pathlib and Python 3.6+ (the f-strings below need 3.6):
from pathlib import Path
failed_assets_folder = Path('/Users/roradhak/eVision/failed_assets')
ts_folder = Path('/Users/roradhak/eVision/ts_check')
def main():
    """Report, for every asset XML file, a missing movie or trailer file.

    Each ``*.xml`` file in ``failed_assets_folder`` is parsed with
    ``parse_file``. "No Movie" is printed when ``<content_id>.ts`` does not
    exist in ``ts_folder``, and "No trailer" when the asset has a trailer but
    ``<trailer_id>.ts`` does not exist there.
    """
    for failed_asset in failed_assets_folder.glob('*.xml'):
        print(f'Processing: {failed_asset.name}')
        # Pass the full path: the bare file name would only resolve when the
        # current working directory happens to be the assets folder.
        content_id, content_name, has_trailer, trailer_id = parse_file(str(failed_asset))
        print(f'{content_id}, {content_name}, {has_trailer}, {trailer_id}')
        if not (ts_folder / f'{content_id}.ts').exists():
            print('No Movie')
        if has_trailer == 'Y' and not (ts_folder / f'{trailer_id}.ts').exists():
            print('No trailer')
It just implements the file search portion and it is not tested though.

Try block gives output even when exception is raised by the last command (but not the first)

I use try/except to catch problems when reading a file line-by-line. The try block contains a series of manipulations, the last of which is usually the cause of the exception. Surprisingly, I noticed that all previous manipulations are executed within the try block even when an exception is raised. This is a problem when trying to turn the dictionary I created to a data frame, because the length of the lists is unequal.
This code creates the problem:
d = {'dates':[],'states':[], 'longitude':[], 'latitude':[], 'tweet_ids':[], 'user_ids':[], 'source':[]}
for file in f:
print("Processing file "+file)
t1 = file.split('/')[-1].split("_")
date = t1[0]
state_code = t1[1]
state = list(states_ref.loc[states_ref.code==state_code]['abbr'])[0]
collection = JsonCollection(file)
counter = 0
for tweet in collection.get_iterator():
counter += 1
try:
d['dates'].append(date)
d['states'].append(state)
t2 = tweet_parser.get_entity_field('geo', tweet)
if t2 == None:
d['longitude'].append(t2)
d['latitude'].append(t2)
else:
d['longitude'].append(t2['coordinates'][1])
d['latitude'].append(t2['coordinates'][0])
#note: the 3 lines bellow are the ones that can raise an exception
temp = tweet_parser.get_entity_field('source', tweet)
t5 = re.findall(r'>(.*?)<', temp)[0]
d['source'].append(t5)
except:
c += 1
print("Tweet {} in file {} had a problem and got skipped".format(counter, file))
print("This is a total of {} tweets I am missing from the {} archive I process.".format(c, sys.argv[1]))
next
tab = pd.DataFrame.from_dict(d)
I have fixed the problem by moving the manipulation that is prone to giving the error at the top, but I would like to better understand why try/except is behaving like this. Any ideas?
This code works:
d = {'dates':[],'states':[], 'longitude':[], 'latitude':[], 'tweet_ids':[], 'user_ids':[], 'source':[]}
for file in f:
print("Processing file "+file)
t1 = file.split('/')[-1].split("_")
date = t1[0]
state_code = t1[1]
state = list(states_ref.loc[states_ref.code==state_code]['abbr'])[0]
collection = JsonCollection(file)
counter = 0
for tweet in collection.get_iterator():
counter += 1
try:
#note: the 3 lines bellow are the ones that can raise an exception
temp = tweet_parser.get_entity_field('source', tweet)
t5 = re.findall(r'>(.*?)<', temp)[0]
d['source'].append(t5)
d['dates'].append(date)
d['states'].append(state)
t2 = tweet_parser.get_entity_field('geo', tweet)
if t2 == None:
d['longitude'].append(t2)
d['latitude'].append(t2)
else:
d['longitude'].append(t2['coordinates'][1])
d['latitude'].append(t2['coordinates'][0])
except:
c += 1
print("Tweet {} in file {} had a problem and got skipped".format(counter, file))
print("This is a total of {} tweets I am missing from the {} archive I process.".format(c, sys.argv[1]))
next
tab = pd.DataFrame.from_dict(d)

You could always use a temporary object to hold the output of your functions before appending to the target object. That way, if something fails, the exception is raised before any data is put into the target object.
try:
    # Collect everything for this tweet in a temporary dictionary first.
    # Any of these lookups can raise, and nothing has been appended yet.
    temp = tweet_parser.get_entity_field('source', tweet)
    t2 = tweet_parser.get_entity_field('geo', tweet)
    tempDictionary = {
        "source": re.findall(r'>(.*?)<', temp)[0],
        # Same index mapping as the question's code:
        # coordinates[0] is the latitude, coordinates[1] the longitude.
        "latitude": None if t2 is None else t2['coordinates'][0],
        "longitude": None if t2 is None else t2['coordinates'][1],
    }
except Exception:
    c += 1
    print("Tweet {} in file {} had a problem and got skipped".format(counter, file))
    print("This is a total of {} tweets I am missing from the {} archive I process.".format(c, sys.argv[1]))
else:
    # Nothing raised, so every list grows by exactly one entry and the
    # columns stay the same length.
    d['source'].append(tempDictionary['source'])
    d['latitude'].append(tempDictionary['latitude'])
    d['longitude'].append(tempDictionary['longitude'])
    d['dates'].append(date)
    d['states'].append(state)

Website Name extract in Python

I want to extract website names from the url. For e.g. https://plus.google.com/in/test.html
should give the output as - "plus google"
Some more testcases are -
WWW.OH.MADISON.STORES.ADVANCEAUTOPARTS.COM/AUTO_PARTS_MADISON_OH_7402.HTML
Output:- OH MADISON STORES ADVANCEAUTOPARTS
WWW.LQ.COM/LQ/PROPERTIES/PROPERTYPROFILE.DO?PROPID=6054
Output:- LQ
WWW.LOCATIONS.DENNYS.COM
Output:- LOCATIONS DENNYS
WV.WESTON.STORES.ADVANCEAUTOPARTS.COM
Output:- WV WESTON STORES ADVANCEAUTOPARTS
WOODYANDERSONFORDFAYETTEVILLE.NET/
Output:- WOODYANDERSONFORDFAYETTEVILLE
WILMINGTONMAYFAIRETOWNCENTER.HGI.COM
Output:- WILMINGTONMAYFAIRETOWNCENTER HGI
WHITEHOUSEBLACKMARKET.COM/
Output:- WHITEHOUSEBLACKMARKET
WINGATEHOTELS.COM
Output:- WINGATEHOTELS
string = str(input("Enter the url "))
new_list = list(string)
count=0
flag=0
if 'w' in new_list:
index1 = new_list.index('w')
new_list.pop(index1)
count += 1
if 'w' in new_list:
index2 = new_list.index('w')
if index2 != -1 and index2 == index1:
new_list.pop(index2)
count += 1
if 'w' in new_list:
index3= new_list.index('w')
if index3!= -1 and index3== index2 and new_list[index3+1]=='.':
new_list.pop(index3)
count+=1
flag = 1
if flag == 0:
start = string.find('/')
start += 2
end = string.rfind('.')
new_string=string[start:end]
print(new_string)
elif flag == 1:
start = string.find('.')
start = start + 1
end = string.rfind('.')
new_string=string[start:end]
print(new_string)
The above works for some testcases but not all. Please help me with it.
Thanks

this is something you could build upon; using urllib.parse.urlparse:
from urllib.parse import urlparse
tests = ('https://plus.google.com/in/test.html',
('WWW.OH.MADISON.STORES.ADVANCEAUTOPARTS.COM/'
'AUTO_PARTS_MADISON_OH_7402.HTML'),
'WWW.LQ.COM/LQ/PROPERTIES/PROPERTYPROFILE.DO?PROPID=6054')
def extract(url):
    """Return the website name contained in *url*.

    The name is the host part of the URL without a leading "www" label and
    without its last label (the TLD), with the remaining labels joined by
    single spaces. The case of the labels is preserved. An empty string is
    returned when nothing is left, e.g. for a host without any dot.
    """
    # urlparse only fills in ``netloc`` when the URL has a scheme. Test for
    # a real "http://" or "https://" prefix, case-insensitively, so that
    # hosts which merely start with "http" still get one added.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    labels = urlparse(url).netloc.split('.')[:-1]  # get rid of TLD
    if labels and labels[0].lower() == 'www':
        labels = labels[1:]
    return ' '.join(labels)
for url in tests:
print(extract(url))

The function strips the url from the double slash to the single slash:
the rest is 'clean up'
def stripURL( url, TwoSlashes, OneSlash ):
    """Return the part of *url* between *TwoSlashes* and the next *OneSlash*.

    The result starts right after the first occurrence of *TwoSlashes* and
    ends just before the first *OneSlash* that follows it. When no
    *OneSlash* follows, as in a URL with no path after the host, everything
    up to the end of *url* is returned. When *TwoSlashes* does not occur at
    all, the empty string is returned.
    """
    start = url.find(TwoSlashes)
    if start == -1:
        return ""
    start += len(TwoSlashes)
    end = url.find(OneSlash, start)
    if end == -1:
        # no terminator after the host: keep the rest of the string
        return url[start:]
    return url[start:end]
url= raw_input("URL : ")
if "www." in url:url=url.replace("www.","")
Strip = stripURL( url, "//", "/" )
# Strips anything after the last period found
Stripped = Strip[:Strip.rfind(".")]
# get rid of the any periods used in the name
Stripped = Stripped.replace("."," ")
print Stripped

Python Wiki Path Searching

On a personal whim I have written some code to search for the shortest series of links between any two Wikipedia articles. It turned out to be very brute force and takes a long long time to find the goal if it's more than a link or two deep, but it works! I will eventually keep track of and make use of the link paths and stuff, but I wanted to get the search working optimally first. Is there a faster way to do this or a good way to cut some major corners here?
import urllib2
from bs4 import BeautifulSoup
Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29'
End = 'http://en.wikipedia.org/wiki/Ayr'
#Using BeautifulSoup, this grabs the page
def soup_request(target):
    """Download *target* and return the page parsed by BeautifulSoup.

    The request is sent with a "Mozilla/5.0" User-Agent header.
    """
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0")
    # Open the prepared request rather than the bare URL; otherwise the
    # header set above is never sent.
    page = urllib2.urlopen(request)
    soup = BeautifulSoup(page)
    return soup
#This will grab all Wiki links off a given page
def get_links(Start):
soup = soup_request(Start)
Wiki_links = []
#Finds all links
for url in soup.findAll('a'):
result = url.get('href')
try:
if str(result)[:5] == '/wiki':
Wiki_links.append(result)
except:
pass
for q in range(len(Wiki_links)):
Wiki_links[q] = 'http://en.wikipedia.org'+str(Wiki_links[q])
print "Got new links from",Start
return Wiki_links
#This will check all the given links to see if the title matches the goal webpage
def check_links(Links,End):
goalsoup = soup_request(End)
goaltitle = goalsoup.html.title
Found = False
count = 0
for q in Links:
if Found:
break
length = len(Links)
#Runs through all the given links and checks their titles for correct one
if q is not None:
count += 1
soup = soup_request(q)
print "Checked",count,"links out of",length
try:
title = soup.html.head.title
if title == goaltitle:
Found = True
print "Found it!"
break
except:
print 'doh'
pass
return Found
#Top function to do all the stuff in the right order, applying a maximum depth of how deep into the links
def wiki_crawl(Start, End, depth):
Old_Links = [Start]
count = depth
while count > 0:
New_Links = []
for q in range(len(Old_Links)):
New_Links.extend(get_links(Old_Links[q]))
Found = check_links(New_Links,End)
if Found:
print "All done."
break
Old_Links = New_Links
count -= 1
print "_______________________________________________________________ROUND DONE"
if not Found:
print "Did not find the page, you must go deeper!"
wiki_crawl(Start, End, 2)

Here are some functions to take info from wiki. The only problem with them is that sometimes a space is taken out of the info from the webpage.
def take_out_parenthesis(st):
    """Return *st* without any '(' or ')' characters.

    Only the parenthesis characters are removed; the text between them is
    kept.
    """
    return st.replace('(', '').replace(')', '')


def take_out_tags(string):
    """Return *string* with every ``<...>`` tag removed.

    Only the tag itself is removed; the characters around it, including
    spaces, are left untouched.
    """
    import re
    return re.sub(r'<[^>]*>', '', string)


def take_out_brackets(string):
    """Return *string* with every ``[...]`` span removed, brackets included."""
    import re
    return re.sub(r'\[[^\]]*\]', '', string)


def take_from_web_page(text):
    """Return the plain-text opening of the English Wikipedia page for *text*.

    Spaces in *text* are replaced by underscores to build the page URL. The
    opening is the part of the page from just after the first ``<p><b>``
    through the first following ``</a>.``; tags, bracketed spans and
    parenthesis characters are then removed from it. An empty string is
    returned when either marker is missing from the page.
    """
    url = text.replace(" ", "_")
    search = "http://en.wikipedia.org/wiki/%s" % url
    page = urllib2.urlopen(search).read()

    start_marker = '<p><b>'
    end_marker = '</a>.'
    start = page.find(start_marker)
    if start == -1:
        return ''
    start += len(start_marker)
    end = page.find(end_marker, start)
    if end == -1:
        return ''
    new_page = page[start:end + len(end_marker)]

    # No padding is inserted before tags here, because take_out_tags removes
    # exactly the tag and nothing around it.
    return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to properly compare 2 JSON request response strings in Python - python

Related

Python implementation with github

Searching for a filename in a folder

Try block gives output even when exception is raised by the last command (but not the first)

Website Name extract in Python

Python Wiki Path Searching

Categories

Resources