How to replace url parameter using python - python

import urllib.parse as urlparse

url = "http://www.example.com?type=aaaaaaa&type1=bbbbbbb&type2=cccccccc"
trigger = ["value1", "value2", "value3"]
parsed = urlparse.urlparse(url)
# Raw "key=value" pairs of the query string, split on "&".
querys = parsed.query.split("&")
result = []
for pairs in trigger:
    # "{}{}".format(query, pairs) concatenates the trigger onto the complete
    # "key=value" pair, so the existing value is extended rather than replaced.
    new_query = "&".join(["{}{}".format(query, pairs) for query in querys])
    parsed = parsed._replace(query=new_query)
    result.append(urlparse.urlunparse(parsed))
print(result)
How to return a list of URLs by replacing the query parameter values?
Output Result :
["http://www.example.com?type=aaaaaaavalue1&type1=bbbbbbbvalue1&type2=ccccccccvalue1", "http://www.example.com?type=aaaaaaavalue2&type1=bbbbbbbvalue2&type2=ccccccccvalue2", "http://www.example.com?type=aaaaaaavalue3&type1=bbbbbbbvalue3&type2=ccccccccvalue3"]
Expected Result:
["http://www.example.com?type=value1&type1=value1&type2=value1", "http://www.example.com?type=value2&type1=value2&type2=value2", "http://www.example.com?type=value3&type1=value3&type2=value3"]
I just want to replace URL parameter values with the custom parameter values and do not want to append them.

You can use the function replace
import urllib.parse

url = "http://www.example.com?type=aaaaaaa&type1=bbbbbbb&type2=cccccccc"
# Parse the URL once and read the query string as ordered (key, value) pairs.
# Working on parsed pairs handles any number of parameters and cannot touch
# other parts of the URL that happen to contain the same text as a value.
parsed = urllib.parse.urlparse(url)
params = urllib.parse.parse_qsl(parsed.query, keep_blank_values=True)
# One replacement value per query parameter: "value1", "value2", ...
trigger = ["value{}".format(i + 1) for i in range(len(params))]
urls = []
for value in trigger:
    # Keep every key, in order, and give all of them the current value.
    new_query = urllib.parse.urlencode([(key, value) for key, _ in params])
    urls.append(urllib.parse.urlunparse(parsed._replace(query=new_query)))
>>> urls
['http://www.example.com?type=value1&type1=value1&type2=value1',
'http://www.example.com?type=value2&type1=value2&type2=value2',
'http://www.example.com?type=value3&type1=value3&type2=value3']

Related

Data output is not the same as inside function

I am currently having an issue where I am trying to store data in a list (using dataclasses). When I print the data inside the list from within the function (PullIncursionData()), it prints a number of different values (never the same; that is not possible due to its nature). When I print it after calling the function and storing its return value in a variable, it somehow prints only the same number.
I cannot share the numbers, as they update with EVE Online's API, so the only way is to run it locally and read the first list yourself.
The repository is Here: https://github.com/AtherActive/EVEAPI-Demo
Heads up! Inside the main.py (the file with issues) (a snippet of code is down below) are more functions. All functions from line 90 and forward are important, the rest can be ignored for this question, as they do not interact with the other functions.
def PullIncursionData():
    """Fetch the incursion list and return one parsed record per entry.

    Requests the ESI incursions URL, decodes the response as JSON and passes
    each entry through __parseIncursionData(). When settings.developerMode
    equals 1, the constellation id of each record is printed as it is added.

    Returns:
        The list of objects returned by __parseIncursionData().
    """
    # Pulls data from URL and converts it into JSON
    url = 'https://esi.evetech.net/latest/incursions/?datasource=tranquility'
    data = rq.get(url)
    jsData = data.json()
    # Init var to store incursions
    incursions = []
    # Set length for the loop.
    length = len(jsData)
    # Every loop incursion data will be read by __parseIncursionData(). It then gets added to var Incursions.
    for i in range(length):
        # Add data to var Incursion.
        incursions.append(__parseIncursionData(jsData, i))
        # If Dev mode, print some debug. Can be toggled in settings.py
        if settings.developerMode == 1:
            print(incursions[i].constellation_id)
    return incursions
# Copies the fields of entry i of the decoded incursion list onto an Incursion
# structure and returns it.
def __parseIncursionData(jsData, i):
    """Fill the incursion structure from jsData[i] and return it.

    Args:
        jsData: decoded JSON list of incursion entries.
        i: index of the entry to read.
    """
    # NOTE(review): stru.Incursion is referenced without being called, so the
    # assignments below are made on that object itself and every call returns
    # the same object. Confirm whether a new instance per call was intended.
    icstruct = stru.Incursion
    icstruct.constellation_id = jsData[i]['constellation_id']
    icstruct.constellation_name = 'none'
    icstruct.staging = jsData[i]['staging_solar_system_id']
    icstruct.region_name = ResolveSystemNames(icstruct.constellation_id, 'con-reg')
    icstruct.status = jsData[i]['state']
    icstruct.systems_id = jsData[i]['infested_solar_systems']
    icstruct.systems_names = ResolveSystemNames(jsData[i]['infested_solar_systems'], 'system')
    return icstruct
# Resolves names for systems, regions and constellations. Still WIP.
def ResolveSystemNames(id, mode='constellation'):
    """Look up names on fuzzwork.co.uk for the given id(s).

    Args:
        id: a constellation id when mode is 'con-reg'; a sequence of solar
            system ids when mode is 'system'.
        mode: 'con-reg' returns the region name of the constellation,
            'system' returns a list with one name per solar system id. Any
            other value (including the default) performs no lookup.

    Returns:
        A region name, a list of system names, or the string 'none'.
    """
    # init value
    output_name = 'none'
    # If constellation, pull data and find region name.
    if mode == 'con-reg':
        url = 'https://www.fuzzwork.co.uk/api/mapdata.php?constellationid={}&format=json'.format(id)
        data = rq.get(url)
        jsData = data.json()
        output_name = jsData[0]['regionname']
    # Pulls system name from Fuzzwork.co.uk.
    elif mode == 'system':
        # Convert output to a list.
        output_name = []
        lenght = len(id)
        # One request per solar system id.
        for i in range(lenght):
            url = 'https://www.fuzzwork.co.uk/api/mapdata.php?solarsystemid={}&format=json'.format(id[i])
            data = rq.get(url)
            jsData = data.json()
            # NOTE(review): the response is indexed with the loop counter i,
            # while the 'con-reg' branch reads element 0 - confirm which index
            # the API response needs.
            output_name.append(jsData[i]['solarsystemname'])
    return output_name
# Fetch the records, then print the constellation id of each one again from
# outside the function for comparison with the in-function debug output.
icdata = PullIncursionData()
print('external data check:')
length = len(icdata)
for incursion in icdata:
    print(incursion.constellation_id)
structures.py (custom file)
from dataclasses import dataclass, field


@dataclass
class Incursion:
    """Record for one incursion; every field starts with a placeholder value.

    The list fields use default_factory so that each instance gets its own
    list instead of sharing one.
    """

    constellation_id: int = -1
    constellation_name: str = 'undefined'
    staging: int = -1
    staging_name: str = 'undefined'
    systems_id: list = field(default_factory=list)
    systems_names: list = field(default_factory=list)
    region_name: str = 'undefined'
    status: str = 'unknown'

Use regex to match 3 characters in string

I have a json payload that I need to match just the SDC in the vdcLocation.
{
"cmdbID":"d01aacda21b7c181aaaaa16dc4bcbca",
"serialNumber":"VBlock740-4239340361f4d-0f6d9d6ad46879",
"vdcLocation":"Data Center-San Diego (SDC)"
}
Here's the code I have so far, what am I missing?
# Question snippet: loads the JSON payload, splits two of its fields on "-"
# and tries to pick the "SDC" code out of vdcLocation.
import json
# NOTE(review): the body of this with-statement is shown without indentation,
# so the snippet does not parse as written.
with open('test-payload.json') as json_file:
data = json.load(json_file)
serialNumber = data["serialNumber"]
dataCenter = data["vdcLocation"]
splittedSerialNumber = serialNumber.split("-") # returns splitted list
firstPart = splittedSerialNumber[0] # accessing the first part of the splitted list
splittedDataCenter = dataCenter.split("-")
lastPart = splittedDataCenter[1]
# NOTE(review): the next line is the one being asked about and is not valid
# Python: an `if` statement cannot be assigned, `^` and `$` sit outside any
# pattern string, and `re` is not imported in this snippet.
vdcLocationOnly = if (re.match^('[SDC]')$):
print(vdcLocationOnly)
print(serialNumber)
print(splittedSerialNumber)
print(firstPart)
print(splittedDataCenter)
print(lastPart)
One solution would be something like the following:
import json
import re

# Read the payload; only the load itself needs the file to be open.
with open('test-payload.json') as json_file:
    data = json.load(json_file)

serialNumber = data["serialNumber"]
dataCenter = data["vdcLocation"]
splittedSerialNumber = serialNumber.split("-")  # returns splitted list
firstPart = splittedSerialNumber[0]  # accessing the first part of the splitted list
splittedDataCenter = dataCenter.split("-")
lastPart = splittedDataCenter[1]

# Plain substring test: true when "SDC" occurs anywhere in the location.
if "SDC" in dataCenter:
    print("found SDC using in")
# Regex test: true only when the location ends with the literal text "(SDC)".
if re.search(r'\(SDC\)$', dataCenter):
    print("found SDC using re")

print(serialNumber)
print(splittedSerialNumber)
print(firstPart)
print(splittedDataCenter)
print(lastPart)
The simplest approach would be to use "SDC" in dataCenter. But if your needs are a bit more complicated and you indeed need to use a regular expression then you probably want to use re.search (see the docs).

How to scrape entire integer in python with Beautiful Soup?

Working on getting some wave heights from websites and my code fails when the wave heights get into the double digit range.
Ex: Currently the code would scrape a 12 from the site as '1' and '2' separately, not '12'.
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup
# Divisor used by calcAverages() when averaging the wave heights.
NUM_SITES = 2
# Receives the per-site result lists at the bottom of the script.
reportsFinal = []
###################### SURFLINE URL STRINGS AND TAG ###########################
# Root URL plus one path ending per surf break; extractReports() requests
# rootUrl + ending for every ending in the list it is given.
slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
# Passed to extractReports() as tagText / tag.
slTagText = 'observed-wave-range'
slTag = 'id'
# NOTE(review): slCardiffUrl is defined above but is not part of slUrls.
#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl]
###############################################################################
#################### MAGICSEAWEED URL STRINGS AND TAG #########################
msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
# Passed to extractReports() as tagText / tag.
msTagText = 'rating-text'
msTag = 'li'
#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################
class surfBreak:
    '''
    This class represents a surf break. It contains all wave, wind, & tide data
    associated with that break relevant to the website
    '''

    def __init__(self, name, low, high, wind, tide):
        # Store the constructor arguments unchanged.
        self.name = name
        self.low = low
        self.high = high
        self.wind = wind
        self.tide = tide

    #toString method
    def __str__(self):
        return '{0}: Wave height: {1}-{2} Wind: {3} Tide: {4}'.format(
            self.name, self.low, self.high, self.wind, self.tide)
#END CLASS
def reportTagFilter(tag):
    '''
    This returns the proper attribute from the surf report sites.

    True when the tag has 'rating-text' among its 'class' values, or when its
    'id' equals 'observed-wave-range'; otherwise False.
    '''
    if tag.has_attr('class') and 'rating-text' in tag['class']:
        return True
    return tag.has_attr('id') and tag['id'] == 'observed-wave-range'
#END METHOD
def representsInt(s):
    '''
    This method checks if the parameter can be converted with int().

    Returns True when int(s) succeeds and False when it raises ValueError or
    TypeError (for example for None).
    '''
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True
#END METHOD
def extractInts(reports):
    '''
    This method extracts all ints from a list of reports
    reports: The list of surf reports from a single website
    returns: reportNums - A list of ints of the wave heights
    '''
    print(reports)
    reportNums = []
    #extract all ints from the reports and ditch the rest
    for report in reports:
        # Every character is tested on its own, so a multi-digit number such
        # as "12" contributes 1 and 2 as two separate entries.
        for char in report:
            if representsInt(char):
                reportNums.append(int(char))
    return reportNums
#END METHOD
def extractReports(rootUrl, urlList, tag, tagText):
    '''
    This method iterates through a list of urls and extracts the surf report from
    the webpage dependent upon its tag location
    rootUrl: The root url of each surf website
    urlList: A list of specific urls to be appended to the root url for each
    break
    tag: the html tag where the actual report lives on the page
    tagText: the attribute value identifying that tag
    returns: the list produced by extractInts() from each break's report text

    tag and tagText are accepted but not read here; the matching tag is
    selected by reportTagFilter().
    '''
    #empty list to hold reports
    reports = []
    #loop thru URLs, counting from 1 for the failure message
    for index, url in enumerate(urlList, 1):
        try:
            #request page
            request = requests.get(rootUrl + url)
            #turn into soup
            soup = BeautifulSoup(request.content, 'lxml')
            #get the first tag accepted by reportTagFilter
            reportTag = soup.findAll(reportTagFilter)[0]
            reports.append(reportTag.text.strip())
        #notify if fail; Exception (not a bare except) so that interrupts
        #and interpreter exit still propagate
        except Exception:
            print('scrape failure at URL  %s' % index)
    reportNums = extractInts(reports)
    return reportNums
#END METHOD
def calcAverages(reportList):
    '''
    This method calculates the average of the wave heights.

    reportList: a sequence whose first two items are indexable lists of
    numbers; positions 0 through 5 of both are read.
    returns: a list of six values, each the sum of the two numbers at the
    same position divided by NUM_SITES
    '''
    #empty list to hold averages
    finalAverages = []
    listIndex = 0
    #loop thru list of reports to calc each breaks ave low and high
    for waveIndex in range(0, 6):
        average = (reportList[listIndex][waveIndex]
                   + reportList[listIndex + 1][waveIndex]) / NUM_SITES
        finalAverages.append(average)
    return finalAverages
#END METHOD
# Scrape both sites, collect the two result lists in reportsFinal and print
# each list with a label.
slReports = extractReports(slRootUrl, slUrls, slTag, slTagText)
msReports = extractReports(msRootUrl, msUrls, msTag, msTagText)
reportsFinal.append(slReports)
reportsFinal.append(msReports)
# NOTE(review): these are Python 2 print statements.
print 'Surfline: ', slReports
print 'Magicseaweed: ', msReports
You are not actually extracting integers, but floats, it seems, since the values in reports are something like ['0.3-0.6 m']. Right now you are just going through every single character and converting them to int one by one or discarding. So no wonder that you will get only single-digit numbers.
One (arguably) simple way to extract those numbers from that string is with regexp:
import re

# Two numbers of the form <digits>.<digit> separated by "-", followed by "m"
# or " m". Written as a raw string so that "\d" reaches the regex engine
# unchanged.
FLOATEXPR = re.compile(r"(\d+\.\d)-(\d+\.\d) ?m")


def extractFloats(reports):
    """Return the wave heights found in *reports* as a flat list of floats.

    Each report is matched from its start against FLOATEXPR and contributes
    its two captured numbers, in order. Reports that do not match the
    pattern contribute nothing.
    """
    reportNums = []
    for report in reports:
        match = FLOATEXPR.match(report)
        if match is None:
            # No "<low>-<high> m" range at the start of this report.
            continue
        reportNums.extend(float(group) for group in match.groups())
    return reportNums
This expression would match your floats and return them as a list.
In detail, the expression matches a number with at least one digit before the '.' and one digit after it, then a '-', then a second number of the same form, ending with 'm' or ' m'. The two numbers are captured as groups and returned as a tuple. For example, ['12.0-3.0m'] would return [12.0, 3.0]. If you expect more digits after the decimal point, you can add an extra '+' after the second '\d' of each number in the expression.

SyntaxError: can't assign to operator

I have written the following function to construct a URL query from a base URL.
# Question snippet: module-level example values followed by generate_url().
start_date='03-03-1997'
end_date='10-04-2015'
yf_base_url ='http://real-chart.finance.yahoo.com/table.csv?s=%5E'
index_list = ['BSESN','NSEI']
# Example of a complete URL, with 'BSESN' substituted for the {} placeholder.
url = "http://real-chart.finance.yahoo.com/table.csv?s=%5E{}&a=03&b=3&c=1997&d=10&e=4&f=2015&g=d&ignore=.csv".format('BSESN')
# NOTE(review): the function body below is shown without indentation.
def generate_url(index, start_date, end_date):
# Only the two known index symbols are handled.
if (index == 'BSESN') or (index == 'NSEI'):
# Split both dates on '-' and take the parts by position.
s_day = start_date.split('-')[0]
s_month = start_date.split('-')[1]
s_year = start_date.split('-')[2]
e_day = end_date.split('-')[0]
e_month = end_date.split('-')[1]
e_year = end_date.split('-')[2]
print('{} {} {} {} {} {}'.format(s_day,s_month,s_year,e_day,e_month,e_year))
# NOTE(review): the next line is the one the SyntaxError refers to; the
# "&a=..." text is outside any string literal, so Python reads "&" as an
# operator and "=" as an assignment.
url = (yf_base_url.join(index))&a=s_day&b=s_month&c=s_year&d=e_day&e=e_month&f=e_year
return url
I get the following error.
File "get_data.py", line 21
url = (yf_base_url.join(index))&a=s_day&b=s_month&c=s_year&d=e_day&e=e_month&f=e_year
SyntaxError: can't assign to operator
I am trying to figure out why this can't be done.
This line isn't valid Python syntax:
url = (yf_base_url.join(index))&a=s_day&b=s_month&c=s_year&d=e_day&e=e_month&f=e_year
Did you mean to format your string using the .format function and construct a url that way? You'd do that like this:
# The index symbol is appended with "+": str.join() would instead place
# yf_base_url between the individual characters of index.
url = yf_base_url + index + "&a={}&b={}&c={}&d={}&e={}&f={}".format(s_day, s_month, s_year, e_day, e_month, e_year)

Reverse regular expression in Python

this is a strange question I know... I have a regular expression like:
rex = r"at (?P<hour>[0-2][0-9]) send email to (?P<name>\w*):? (?P<message>.+)"
so if I match that like this:
match = re.match(rex, "at 10 send email to bob: hi bob!")
match.groupdict() gives me this dict:
{"hour": "10", "name": "bob", "message": "hi bob!"}
My question is: given the dict above and rex, can I make a function that returns the original text? I know that many texts can match to the same dict (in this case the ':' after the name is optional) but I want one of the infinite texts that will match to the dict in input.
Using inverse_regex:
"""
http://www.mail-archive.com/python-list@python.org/msg125198.html
"""
import itertools as IT
import sre_constants as sc
import sre_parse
import string
# Generate strings that match a given regex
# Characters generated for the digit, whitespace and word regex categories.
category_chars = {
    sc.CATEGORY_DIGIT: string.digits,
    sc.CATEGORY_SPACE: string.whitespace,
    # string.ascii_letters exists on Python 2 and 3; string.letters is
    # Python 2 only.
    sc.CATEGORY_WORD: string.digits + string.ascii_letters + '_',
}
def unique_extend(res_list, list):
    """Append to res_list, in order, every item of list it does not contain.

    res_list is modified in place; nothing is returned.
    """
    for item in list:
        if item not in res_list:
            res_list.append(item)
def handle_any(val):
    """
    This is different from normal regexp matching. It only matches
    printable ASCII characters.

    val is not used; the return value is always string.printable.
    """
    return string.printable
def handle_branch(branch):
    """Return the distinct strings produced by all alternatives of a branch.

    branch is a two-item sequence; its second item is iterated as the list of
    alternative token lists. It is unpacked inside the body because tuple
    parameters in the signature are Python 2 only.
    """
    _tok, val = branch
    all_opts = []
    for toks in val:
        opts = permute_toks(toks)
        unique_extend(all_opts, opts)
    return all_opts
def handle_category(val):
    """Return the characters registered in category_chars for val as a list.

    Raises KeyError when val has no entry in category_chars.
    """
    return list(category_chars[val])
def handle_in(val):
    """Return the combined options of every (token, value) member of a set."""
    out = []
    # Distinct loop names so the parameter is not rebound while it is
    # being iterated.
    for member_tok, member_val in val:
        out += handle_tok(member_tok, member_val)
    return out
def handle_literal(val):
    """Return a one-element list holding the character with code val."""
    return [chr(val)]
def handle_max_repeat(repeat):
    """
    Handle a repeat token such as {x,y} or ?.

    repeat is a (min, max, val) triple; only the first (token, value) pair of
    val is repeated. Returns a generator of strings covering every repeat
    count from min to max, in increasing count order.
    """
    # Unpacked here because tuple parameters are Python 2 only.
    lo, hi, val = repeat
    subtok, subval = val[0]
    if hi > 5000:
        # max is the number of cartesian join operations needed to be
        # carried out. More than 5000 consumes way too much memory.
        # raise ValueError("To many repetitions requested (%d)" % max)
        hi = 5000
    # Materialised once so that the options can be iterated for every count.
    optlist = tuple(handle_tok(subtok, subval))
    # The product for each count is created only when the generator reaches
    # it, instead of building all of them up front.
    return (''.join(it)
            for count in range(lo, hi + 1)
            for it in IT.product(optlist, repeat=count))
def handle_range(val):
    """Return a generator of the characters with codes lo through hi inclusive.

    val is a (lo, hi) pair of character codes.
    """
    lo, hi = val
    return (chr(x) for x in range(lo, hi + 1))
def handle_subpattern(val):
    """Return the strings generated by the tokens of a group as a list.

    The sub-pattern is taken from the last element of val: it is the second
    of two elements on Python 2 and the last of four on Python 3.6+, where
    val[1] holds flags instead.
    """
    return list(permute_toks(val[-1]))
def handle_tok(tok, val):
    """
    Returns the possible strings for this regexp token.

    Dispatches on tok to the matching handle_* function and returns its
    result, which is a list, a string or a generator depending on the handler.
    Raises ValueError for a token without a handler.
    """
    handlers = {
        sc.ANY: handle_any,
        sc.BRANCH: handle_branch,
        sc.CATEGORY: handle_category,
        sc.LITERAL: handle_literal,
        sc.IN: handle_in,
        sc.MAX_REPEAT: handle_max_repeat,
        sc.RANGE: handle_range,
        sc.SUBPATTERN: handle_subpattern,
    }
    try:
        return handlers[tok](val)
    except KeyError:
        # Reached for an unknown tok and also when the handler itself raises
        # KeyError, as in the original.
        fmt = "Unsupported regular expression construct: %s"
        raise ValueError(fmt % tok)
def permute_toks(toks):
    """
    Returns a generator of strings of possible permutations for this
    regexp token list.

    The options of every token are computed first; each generated string is
    one choice per token, joined in token order.
    """
    lists = [handle_tok(tok, val) for tok, val in toks]
    return (''.join(it) for it in IT.product(*lists))
########## PUBLIC API ####################
def ipermute(p):
    """Return a generator of strings generated from the regex pattern p.

    p is parsed with sre_parse.parse() and the tokens are expanded by
    permute_toks().
    """
    return permute_toks(sre_parse.parse(p))
You could apply the substitutions given rex and data, and then use inverse_regex.ipermute to generate strings that match the original regex:
import re
import itertools as IT
import inverse_regex as ire

rex = r"(?:at (?P<hour>[0-2][0-9])|today) send email to (?P<name>\w*):? (?P<message>.+)"
match = re.match(rex, "at 10 send email to bob: hi bob!")
data = match.groupdict()
del match
# Replace every named group in rex with the text captured for it. The text is
# escaped so that characters with a meaning in regex syntax are generated
# literally; a group that did not take part in the match becomes empty.
new_regex = re.sub(
    r'[(][?]P<([^>]+)>[^)]*[)]',
    lambda m: re.escape(data.get(m.group(1)) or ''),
    rex,
)
# Print the first ten strings generated from the substituted pattern.
for s in IT.islice(ire.ipermute(new_regex), 10):
    print(s)
yields
today send email to bob hi bob!
today send email to bob: hi bob!
at 10 send email to bob hi bob!
at 10 send email to bob: hi bob!
Note: I modified the original inverse_regex to not raise a ValueError when the regex contains *s. Instead, the * is changed to be effectively like {,5000} so you'll at least get some permutations.
This is one of the texts that will match the regex:
'at {hour} send email to {name}: {message}'.format(**match.groupdict())

Categories