Currently listed solution to .replace() not fixing my issue - Python

I have just tried the syntax suggested on a previous question similar to mine and it has not worked for me. I have tried:
newvar = str(oldvar)
newvar = newvar.replace('\r', '').replace('\n', ' ')
and I have tried:
newvar = oldvar.replace('\r', '').replace('\n', ' ')
My full (relevant) sections of code are here:
URLS = myurls2.values()

# Retrieve a single page and report the url and contents
def load_url(key, url, timeout):
    conn = urllib.request.urlopen(url, timeout=timeout)
    return conn.readall()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, key, url, 60): (key, url)
                     for key, url in myurls2.items()}
    c = 0
    for future in concurrent.futures.as_completed(future_to_url):
        key, url = future_to_url[future]
        try:
            data = str(future.result())
            data2 = str(data)
            data2 = data2.replace('\r', '').replace('\n', ' ')
            print(data2)
            d = open(filepath, "w")
            d.write(data2)
            d.close()
Dictionary values are converted to urls that are submitted to the relevant website; at the end I wish to convert the result into a string and remove the characters listed in the replace statement.
Thanks
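For what it's worth, a hedged guess at the likely cause: future.result() returns bytes here, and in Python 3 str(bytes) produces the literal "b'...'" representation, in which the line breaks appear as the two-character escapes \r and \n rather than real control characters, so replace('\r', '') finds nothing to remove. Decoding the bytes first (a sketch, assuming the pages are UTF-8) lets the replace calls operate on real newlines:

data = future.result()                          # bytes returned by conn.readall()
text = data.decode('utf-8', errors='replace')   # decode instead of wrapping in str()
text = text.replace('\r', '').replace('\n', ' ')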

Related

Parsing query parameters in Python

The problem lies somewhere in how I'm parsing and/or reassembling urls. I'm losing the ?id=1 and getting ?d=1.
What I am trying to do is have the ability to manipulate the query parameters and reassemble the url before sending it back out modified. Meaning the dictionary would be modified, and then, using urlencode(modified_dict), I would reassemble the url + query.
Can someone give me a pointer on what I'm doing wrong here?
from urlparse import parse_qs, urlparse, urlsplit
from urllib import urlencode
import os
import sys
import mechanize
from collections import OrderedDict
import urllib2

scrape_post_urls = []
get_inj_tests = []

#check multiple values to strip out duplicate and useless checks
def parse_url(url):
    parsed = urlparse(url, allow_fragments=False)
    if parsed.query:
        if url not in get_inj_tests:
            get_inj_tests.append(url)
            #print url
        '''get_inj_tests.append(url)
        print url
        #print 'scheme  :', parsed.scheme
        #print 'netloc  :', parsed.netloc
        print 'path    :', parsed.path
        print 'params  :', parsed.params
        print 'query   :', parsed.query
        print 'fragment:', parsed.fragment
        #print 'hostname:', parsed.hostname, '(netloc in lower case)'
        #print 'port    :', parsed.port
        '''
    else:
        if url not in scrape_post_urls:
            scrape_post_urls.append(url)
            #print url

def main():
    unparsed_urls = open('in.txt', 'r')
    for urls in unparsed_urls:
        try:
            parse_url(urls)
        except:
            pass

    print(len(scrape_post_urls))
    print(len(get_inj_tests))

    clean_list = list(OrderedDict.fromkeys(get_inj_tests))
    reaasembled_url = ""
    #print clean_list
    for query_test in clean_list:
        url_object = urlparse(query_test, allow_fragments=False)
        #parse query paramaters
        url = query_test.split("?")[1]
        dicty = {x[0]: x[1] for x in [x.split("=") for x in url[1:].split("&")]}
        query_pairs = [(k, v) for k, vlist in dicty.iteritems() for v in vlist]
        reaasembled_url = "http://" + str(url_object.netloc) + str(url_object.path) + '?'
        reaasembled_query = urlencode(query_pairs)
        full_url = reaasembled_url + reaasembled_query
        print dicty

main()
"Can someone give me a pointer on what I'm doing wrong here?"
Well, quite simply, you're not using the existing tools:
1/ to parse a query string, use urllib.parse.parse_qsl().
2/ to reassemble the query string, use urllib.parse.urlencode().
And forget about dicts: query strings can have multiple values for the same key, i.e. ?foo=1&foo=2 is perfectly valid.
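A minimal round-trip sketch of that advice (Python 3 names as used above; in Python 2 the same functions live in the urlparse and urllib modules), with a made-up example url:

from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

url = "http://example.com/search.php?id=1&foo=1&foo=2"
parts = urlsplit(url)
pairs = parse_qsl(parts.query)                              # [('id', '1'), ('foo', '1'), ('foo', '2')]
pairs = [(k, '22' if k == 'id' else v) for k, v in pairs]   # modify a value, repeated keys survive
print(urlunsplit(parts._replace(query=urlencode(pairs))))
# http://example.com/search.php?id=22&foo=1&foo=2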
First of all, url is a bad name for the params variable and could create confusion.
>>> url = "https://url.domian.com?id=22&param1=1&param2=2".split("?")[1]
>>> url
'id=22&param1=1&param2=2'
>>> "https://url.domian.com?id=22&param1=1&param2=2".split("?")[1].split("&")
['id=22', 'param1=1', 'param2=2']
The error is in the url[1:].split("&")
Solution:
>>> dicty = {x[0]: x[1] for x in [x.split("=") for x in url.split("&")]}
>>> dicty
{'id': '22', 'param1': '1', 'param2': '2'}

How to change the value of the parameter in Python?

NOTE: There is no fixed url for this, meaning the url will not always look exactly like this. I want code that works for all such urls.
For example, http://januapp.com/demo/search.php?search=aaa
http://januapp.com/demo/search.php?other=aaa
Now I want to change it to
http://januapp.com/demo/search.php?search=bbb
http://januapp.com/demo/search.php?other=bbb
I don't know how I can do it.
I tried this:
import optparse
import requests
import urlparse

parser = optparse.OptionParser()
parser.add_option("-t", "--Host", dest="Target", help="Please provide the target", default="true")
options, args = parser.parse_args()
url = options.Target
xss = []
xss.append("bbb")

try:
    url2 = urlparse.urlparse(url)
    print url2
    url3 = urlparse.parse_qs(url2.query)
    parametervalue = [key for key, key in url3.iteritems()]  # [['aaa']]
    parsed = parametervalue.append(xss[0])
    print parsed
    finalurl = urljoin(url, parsed)
    print finalurl
except Exception as e:
    print e
So when I pass this:
xss3.py -t http://januapp.com/demo/search.php?search=aaa
the error below occurs in the cmd:
ParseResult(scheme='http', netloc='januapp.com', path='/demo/search.php', params='', query='search=aaa', fragment='')
None
name 'urljoin' is not defined
See the None?
Now that's the problem.
I am using Python 2.7.
Thank you very much. I hope you can see the problem.
You can try something with this kind of approach.
url = 'http://januapp.com/demo/search.php?search=aaa'

# First get all your query params
arr = url.split('?')

base_url = arr[0]  # This is your base url, i.e. 'http://januapp.com/demo/search.php'
params = arr[1]    # here are your query params, i.e. 'search=aaa'

# Now separate out all the query parameters and their values
arr2 = params.split("=")  # This will give you something like ['search', 'aaa']; the value sits next to its key

# This is a dictionary to hold the key-value pairs
param_value_dict = {}  # {'search': 'aaa'}
for i, token in enumerate(arr2):
    if i % 2 == 0:
        param_value_dict[token] = arr2[i + 1]

# now if you want to change the value of search from 'aaa' to 'bbb', just change it in the dictionary
param_value_dict['search'] = 'bbb'

# now form the new url from the dictionary
new_url = base_url + '?'
for param_name, param_value in param_value_dict.items():
    new_url = new_url + param_name + "=" + param_value + "&"

# remove the extra '&'
new_url = new_url[:len(new_url) - 1]
print(new_url)
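For reference, the same replacement can also be done with the standard library URL helpers rather than manual string splitting (a sketch only, using the Python 2 module names from the question):

from urlparse import urlparse, parse_qsl, urlunparse
from urllib import urlencode

url = 'http://januapp.com/demo/search.php?search=aaa'
parts = urlparse(url)
pairs = [(k, 'bbb') for k, v in parse_qsl(parts.query)]  # set every parameter value to 'bbb'
print urlunparse(parts._replace(query=urlencode(pairs)))
# http://januapp.com/demo/search.php?search=bbb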
How about:
ext = "bbb"
a = "http://januapp.com/demo/search.php?search="
print a + ext
Where ext is what you want to search for and a is the link; just add them together.
Or you could replace values like this:
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
print a.replace('aaa', ext)
Using regex:
import re
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
b = re.compile(r".+search=")
print re.search(b, a).group() + ext

Shoving Scrapy Objects into array when parsing; what am I doing wrong?

I've basically created a spider that follows a set of links acquired from an API, and then extracts text from the HTML body. I'm trying to append returned items to appropriate lists, which are then added to a dictionary. When I run the code, the resultant JSON file only successfully writes the first line.
I am running Python 3.6 in a virtual environment on a Windows 10 64-bit machine, and I run pip-upgrade daily.
from nltk.corpus import stopwords
import smtplib
from time import sleep  # To prevent overwhelming the server between connections
from bs4 import BeautifulSoup as soup
import scrapy
import mysql.connector as mariadb
import sys
from collections import Counter
from pprint import pprint
import json
import re

conn = mariadb.connect(user=dbuser, password=dbpassword, database=dbdatabase)
c = conn.cursor()
e = sys.exc_info()[0]

c.execute("Select URL FROM [TABLE]")
JobURLs = c.fetchall()
for object in JobURLs:
    urls = []
    url_string = str(object)
    rx = re.compile('\W\W\W$')
    res = rx.sub('', url_string)
    rx = re.compile('^\W\W')
    url = rx.sub('', res)
    urls.append(url)

c.execute("Select JvId FROM [TABLE]")
JobIDs = c.fetchall()
for object in JobIDs:
    item = {}
    item['JvId'] = []
    JobID_string = str(object)
    rx = re.compile('\W\W\W$')
    res = rx.sub('', JobID_string)
    rx = re.compile('^\W\W')
    JobID = rx.sub('', res)
    item['JvId'].append(JobID)

class JobListing(scrapy.Spider):
    name = 'JobListingCrawler'
    start_urls = urls

    def parse(self, response):
        # pass
        item['urlText'] = response.url
        page_html = response.body
        page_soup = soup(page_html, 'lxml')
        for script in page_soup(['script', 'style']):
            script.extract()
        item['jobDescText'] = page_soup.get_text('''\n''', strip=True)

        ## TextCleaner Function for Word Counter
        text = item['jobDescText'].replace('\n', ' ')
        lines = [line.strip() for line in text.splitlines()]
        chunks = [phrase.strip() for line in lines for phrase in line.split(' ')]

        def chunk_space(chunk):
            chunk_out = chunk + ' '
            return chunk_out

        text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8')
        try:
            text = text.decode('unicode_escape').encode('ascii', 'ignore')
        except:
            print(e)
            pass
        text = re.sub('[^a-zA-Z,+3]', ' ', str(text))
        text = text.lower().split()
        stop_words = set(stopwords.words('english'))
        text = [word for word in text if not word in stop_words]
        wordCounter = Counter(text)
        item['wordCounter'] = str(wordCounter)

        ## And now we parse for email addresses!
        prog = re.compile(r"[A-z0-9._%+-]+#[A-z0-9.-]+\.[A-z]{2,}")
        found = prog.search(item['jobDescText'].replace('\n', ' '))
        try:
            item['email'] = str(found.group(0))
        except:
            item['email'] = 'null'
            pass

        filename = 'results.jl'
        line = json.dumps(dict(item)) + '\n'
        with open(filename, 'a') as f:
            f.write(line)
        self.log('Saved Line to %s' % filename)
You just need to declare a Scrapy Item, which contains your returned fields' definitions.
After that, you just need to configure your settings file to allow Scrapy Feed Exports, using the built-in JsonItemExporter for your extracted data:
FEED_URI: file:///tmp/export.json
FEED_FORMAT: json
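A minimal sketch of such an Item declaration (the field names are simply the dictionary keys the spider above fills in; adjust to taste):

import scrapy

class JobListingItem(scrapy.Item):
    # one Field per key populated in parse()
    JvId = scrapy.Field()
    urlText = scrapy.Field()
    jobDescText = scrapy.Field()
    wordCounter = scrapy.Field()
    email = scrapy.Field()

The spider then yields a populated JobListingItem from parse() instead of writing results.jl by hand, and the configured feed export writes the JSON file.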
So silly me: I put the list variable inside the for loop, so each pass through the loop deleted the previously written values. Moving it outside of the loop solved the problem.
c.execute("Select URL FROM CareerOneStopJobs")
JobURLs = c.fetchall()
urls = []
for element in JobURLs:
url_string = str(element)
rx = re.compile('\W\W\W$')
res = rx.sub('', url_string)
rx = re.compile('^\W\W')
url = rx.sub('', res)
urls.append(url)
c.execute("Select JvId FROM CareerOneStopJobs")
JobIDs = c.fetchall()
item = {}
for JobID in JobIDs:
item['JvId'] = []
JobID_string = str(JobID)
rx = re.compile('\W\W\W$')
res = rx.sub('', JobID_string)
rx = re.compile('^\W\W')
JobID = rx.sub('', res)
item['JvId'] = JobID

How to use Python requests and looping to write JSON files

I have written some Python code to help me pull data from an API. The first version of my program works quite well.
Now I am trying to develop a more DRY version of the code by introducing functions and loops. I am still new to Python.
Your professional advice will be really appreciated.
import requests
import json
# Bika Lims Authentication details
username = 'musendamea'
password = '!Am#2010#bgl;'
# API url Calls for patients, analysis and cases
patient_url = "adress1"
analysis_url = "adress2"
cases_url = "adress3"
# peform API calls and parse json data
patient_data = requests.get(patient_url, auth=(username, password ))
analysis_data = requests.get(analysis_url, auth=(username, password ))
cases_data = requests.get(cases_url, auth=(username, password ))
patients = json.loads(patient_data.text)
analysis = json.loads(analysis_data.text)
cases = json.loads(cases_data.text)
# checks for errors if any
print ("Patients")
print (patients['error'])
print (patients['success'])
print (patients['last_object_nr'])
print (patients['total_objects'])
print ("\n Analysis")
print (analysis['error'])
print (analysis['success'])
print (analysis['last_object_nr'])
print (analysis['total_objects'])
print ("\n Cases")
print (cases['error'])
print (cases['success'])
print (cases['last_object_nr'])
print (cases['total_objects'])
# create and save json files for patients, analysis and cases
with open('patients.json', 'w') as outfile:
    json.dump(patients['objects'], outfile)

with open('analysis.json', 'w') as outfile1:
    json.dump(analysis['objects'], outfile1)

with open('cases.json', 'w') as outfile2:
    json.dump(cases['objects'], outfile2)
The above code works pretty well, but my challenge is making the code DRY. Somehow the loop breaks when I change that section to the following:
your_domain = "10.0.0.191"
data_types = ['patients', 'analysis', 'cases']
checkers = ['error', 'success', 'total_objects']
urls = []
data_from_api = []

# API url Call
base_url = "http://" + your_domain + "/##API/read?"
page_size = "1000000000000000000"
patient_url = base_url + "catalog_name=bika_patient_catalog&page_size="
+ page_size
analysis_url = base_url + "portal_type=AnalysisRequest&
review_state=published&page_size=" + page_size
cases_url = base_url + "portal_type=Batch&page_size=" + page_size
urls.append(patient_url)
urls.append(analysis_url)
urls.append(cases_url)

# peform API calls and parse json data
def BikaApiCalls(urls, username, password):
    for i in len(urls):
        data_ = requests.get(urls[i - 1], auth = (username, password))
        print (data_types[i] + " ~ status_code: ")
        print (data_.status_code + "\n")
        data_from_api.append(json.loads(data_.text))
        for val in len(checkers):
            print (data_from_api[i][val])

BikaApiCalls(urls, username, password)

# Write JSON files
def WriteJson(data_types, data_from_api):
    for i in len(data_from_api):
        with open(data_types[i] + '.json', 'w') as outfile:
            json.dump(data_from_api[i]['objects'], outfile)

WriteJson(data_types, data_from_api)
Where am I getting it wrong? I tried some debugging but I can't seem to get through it. I'd really appreciate your help.
Thanks in advance :)
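For what it's worth, a hedged sketch of how those two functions would more usually be written: len(...) returns an int and cannot be iterated, so the loops need range(...) or enumerate(...); urls[i - 1] skips the first URL on the first pass; status_code is an int and cannot be concatenated to a string; and the inner loop should index the response dict by the checker names, not by integers. Something along these lines (an untested guess at the intent, relying on the module-level data_types, checkers and data_from_api defined above):

def BikaApiCalls(urls, username, password):
    for i, url in enumerate(urls):
        data_ = requests.get(url, auth=(username, password))
        print(data_types[i] + " ~ status_code: " + str(data_.status_code))
        data_from_api.append(json.loads(data_.text))
        for checker in checkers:
            print(data_from_api[i][checker])

def WriteJson(data_types, data_from_api):
    for i in range(len(data_from_api)):
        with open(data_types[i] + '.json', 'w') as outfile:
            json.dump(data_from_api[i]['objects'], outfile)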

ScraperWiki/Python: filtering out records when property is false

I'm using the following code on ScraperWiki to search Twitter for a specific hashtag.
It's working great and is picking out any postcode provided in the tweet (or returning false if none is available). This is achieved with the line data['location'] = scraperwiki.geo.extract_gb_postcode(result['text']).
But I'm only interested in tweets which include postcode information (this is because they're going to be added to a Google Map at a later stage).
What would be the easiest way to do this? I'm relatively au fait with PHP, but Python's a completely new area for me.
Thanks in advance for your help.
Best wishes,
Martin
import scraperwiki
import simplejson
import urllib2

QUERY = 'enter_hashtag_here'
RESULTS_PER_PAGE = '100'
NUM_PAGES = 10

for page in range(1, NUM_PAGES+1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['location'] = scraperwiki.geo.extract_gb_postcode(result['text'])
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            print data['from_user'], data['text']
            scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
Do you just want this? I tried it on the free ScraperWiki test page and it seems to do what you want. If you're looking for something more complicated, let me know.
import scraperwiki
import simplejson
import urllib2

QUERY = 'meetup'
RESULTS_PER_PAGE = '100'
NUM_PAGES = 10

for page in range(1, NUM_PAGES+1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['location'] = scraperwiki.geo.extract_gb_postcode(result['text'])
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            if data['location']:
                print data['location'], data['from_user']
                scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
Outputs:
P93JX VSDC
FV36RL Bootstrappers
Ci76fP Eli_Regalado
UN56fn JasonPalmer1971
iQ3H6zR GNOTP
Qr04eB fcnewtech
sE79dW melindaveee
ud08GT MariaPanlilio
c9B8EE akibantech
ay26th Thepinkleash
I've refined it a bit so it's pickier than the ScraperWiki check for extracting GB postcodes, which lets through quite a few false positives. Basically I took the accepted answer from here and added some negative lookbehind/lookahead to filter out a few more. It looks like the ScraperWiki check does the regex without the negative lookbehind/lookahead. Hope that helps a bit.
import scraperwiki
import simplejson
import urllib2
import re

QUERY = 'sw4'
RESULTS_PER_PAGE = '100'
NUM_PAGES = 10
postcode_match = re.compile('(?<![0-9A-Z])([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {0,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)(?![0-9A-Z])', re.I)

for page in range(1, NUM_PAGES+1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['location'] = scraperwiki.geo.extract_gb_postcode(result['text'])
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            if data['location'] and postcode_match.search(data['text']):
                print data['location'], data['text']
                scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
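For illustration, a quick standalone check of what the added lookarounds buy (the sample strings here are made up, not taken from the scraper output):

import re

postcode_match = re.compile('(?<![0-9A-Z])([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {0,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)(?![0-9A-Z])', re.I)

print bool(postcode_match.search('Meet at SW4 7AA tonight'))    # True: a stand-alone postcode
print bool(postcode_match.search('Ref code XSW47AA9 shipped'))  # False: embedded in a longer token, rejected by the lookarounds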
