Eliminate unwanted characters from JSON file using different threads (Python) - python

In my python file, I have created a class called Download. The code where the class is:
import requests, json, os, pytube, threading
class Download:
    """Download a resource and optionally save a cleaned copy to disk.

    Up to three (unwanted, wanted) substring pairs can be supplied; when the
    downloaded text is saved, every pair is applied in order.
    """

    def __init__(self, url, json=False, get=False, post=False, put=False, unwanted="", wanted="", unwanted2="", wanted2="", unwanted3="", wanted3=""):
        """Store the request settings and the replacement pairs.

        Args:
            url: Address to download from.
            json: When True, the downloaded text is written to
                ``downloadedJson.json``.
            get: When True, the download is performed with an HTTP GET.
            post: Stored on the instance; not used by the methods of this class.
            put: Stored on the instance; not used by the methods of this class.
            unwanted, unwanted2, unwanted3: Substrings to replace.
            wanted, wanted2, wanted3: Replacement for the matching
                ``unwanted*`` substring.
        """
        self.url = url
        self.json = json
        self.get = get
        self.post = post
        self.put = put
        self.unwanted = unwanted
        self.wanted = wanted
        self.unwanted2 = unwanted2
        self.wanted2 = wanted2
        self.unwanted3 = unwanted3
        self.wanted3 = wanted3

    def _apply_replacements(self, text):
        """Return *text* with every configured replacement pair applied in order."""
        pairs = (
            (self.unwanted, self.wanted),
            (self.unwanted2, self.wanted2),
            (self.unwanted3, self.wanted3),
        )
        for unwanted, wanted in pairs:
            # An empty pattern matches between every character, so an unset
            # pair must be skipped instead of being passed to str.replace().
            if unwanted:
                text = text.replace(unwanted, wanted)
        return text

    def downloadJson(self):
        """Fetch ``self.url`` and, if requested, save the cleaned text.

        Nothing happens unless ``get`` is True. When ``json`` is True the
        downloaded text is cleaned with all replacement pairs and written
        once to ``downloadedJson.json``.
        """
        if self.get is not True:
            return
        response = requests.get(self.url)
        # NOTE(review): requests documents ``.content`` as bytes, so str() here
        # presumably yields the b'...' repr (with escaped sequences such as \\n).
        # ``response.text`` is probably what is wanted; kept as-is so existing
        # ``unwanted`` patterns keep matching -- confirm before changing.
        downloaded = str(response.content)
        if self.json is True:
            cleaned = self._apply_replacements(downloaded)
            # Write exactly once: reopening the file in "w" mode before reading
            # it back truncates it and loses the data.
            with open("downloadedJson.json", "w") as output_file:
                output_file.write(cleaned)
I have written all this by myself, and I understand how it works. My objective is to remove all the unwanted characters that come in the JSON file once downloaded, such as: \\n, \' or \n. I have many arguments in the __init__() function, like the __init__(unwanted="", wanted="", unwanted2="") etcetera.
By this, when adding any character to the unwanted parameter, such as: \\n, it should replace all these characters by a space. This is done properly, and it works. The lines of code that are comments are the lines of code that I was using, but that did not work. It would only replace the characters from only 1 argument.
Is there any way of replacing the unwanted characters given in each of these arguments, using threads? If it is not possible using threads, is there any alternative?
By the way, the file where I am executing the class: (main.py):
from downloader import Download
with open("url.txt", "r") as url:
    # A URL stored on its own line in a text file carries a trailing newline;
    # strip surrounding whitespace so it does not become part of the request.
    x = Download(url.read().strip(), get=True, json=True, unwanted="\\n")

You could apply the replacements one after another:
x = a.replace(self.unwanted, self.wanted)
x = x.replace(self.unwanted2, self.wanted2)
x = x.replace(self.unwanted3, self.wanted3)
You could also chain the replacement together, but that would quickly become hard to read:
x = a.replace(...).replace(...).replace(...)
Btw, instead of having multiple unwantedN and wantedN,
it would be probably a lot easier to use a list of (unwanted, wanted) pairs, something like this:
def __init__(self, url, json=False, get=False, post=False, put=False, replacements=None):
    """Store the request settings and the replacement pairs.

    Args:
        url: Address to download from.
        json, get, post, put: Boolean switches stored on the instance.
        replacements: Iterable of ``(unwanted, wanted)`` pairs. Defaults to
            a new empty list for each instance.
    """
    self.url = url
    self.json = json
    self.get = get
    self.post = post
    self.put = put
    # A literal ``[]`` default is created once and shared by every instance
    # built without an explicit argument, so use None as the sentinel.
    self.replacements = [] if replacements is None else replacements
And then you could perform the replacements in a loop:
x = a
for unwanted, wanted in self.replacements:
x = x.replace(unwanted, wanted)


TypeError: list indices must be integers or slices, not Tag. One of my loops isn't working

The ultimate goal of this is to output selected data columns to a .csv. I had it working once, when it only got the first table on the page, but I needed both tables. Now it raises the error in the title. I'm quite new to Python and I don't know how I got to this point in the first place. I needed the call and put tables, but on the web page the calls came first, and when I used .find I only got the calls. I am working on this with a friend, and he put in the last two functions. He could get the columns I wanted, but now we only get the calls. I tried to fix it and now it gives the error in the title.
import bs4
import requests
import pandas as pd
import csv
from bs4 import BeautifulSoup
# Sets the desired tickers; in the future this list could be made longer.
def ticker():
    """Return the list of ticker symbols to look up."""
    return ['GME', 'NYMT']
# Creates the list of URLs for the scraper to grab.
def ticker_site():
    """Return one Yahoo Finance options-page URL per symbol from ticker()."""
    urls = []
    for symbol in ticker():
        urls.append('https://finance.yahoo.com/quote/' + symbol + '/options?p=' + symbol)
    return urls
# Module-level list; soup_search() trims its entries and returns it.
optionRows = []
# NOTE(review): the original indentation is lost in this view, so it is unclear
# which of the following definitions were meant to sit inside this loop.
for i in range(len(ticker_site())):
# Fetches the page at option_page[i] and returns it parsed with 'html.parser'.
# NOTE(review): ``i`` is not a parameter; it is read from the loop variable
# above when the function is called -- presumably not the intended
# one-page-per-ticker behaviour; confirm.
def ticker_gets():
option_page = ticker_site()
requested_page = requests.get(option_page[i])
ticker_soup = BeautifulSoup(requested_page.text,'html.parser')
return ticker_soup
# Takes the parsed page from ticker_gets(), picks the first two <table>
# elements as the call and put tables, walks their rows, then trims and
# returns the module-level optionRows list.
def soup_search():
table = ticker_gets()
both_tables = table.find_all('table')
call_table = both_tables[0]
put_table= both_tables[1]
# NOTE(review): BeautifulSoup's find() is documented to return a single
# element, so the loops below likely iterate over the children of the first
# <tr> rather than over rows; find_all('tr') is probably intended -- confirm.
call_rows = call_table.find('tr')
put_rows = put_table.find('tr')
#makes the call table
# NOTE(review): call_row is rebuilt on every iteration and is not stored
# anywhere in the visible code.
for call in call_rows:
whole_call_table = call.find_all('td')
call_row = [y.text for y in whole_call_table]
#makes the put table
# NOTE(review): put_row is likewise rebuilt and never stored in the visible code.
for put in put_rows:
whole_put_table = put.find_all('td')
put_row = [z.text for z in whole_put_table]
# Drops the first element of every entry in optionRows. The list is
# module-level, so this trimming is applied again on every call.
for i in range(len(optionRows)):
optionRows[i] = optionRows[i][1:len(optionRows[i])]
return optionRows
# Walks the rows returned by soup_search() for each ticker index and the
# requested column indexes, and returns newList.
# NOTE(review): nothing is appended to newList in the visible code, so it is
# returned empty; the selecting statement appears to be missing.
# NOTE(review): columnIndexes uses a mutable list as its default value; it is
# not mutated in the visible code.
def getColumns(columnIndexes=[2, 4, 5]):
newList = []
# soup_search() is called again for the length and for every ticker index.
for tickerIndex in range(len(soup_search())):
indexCount = 0
for j in soup_search()[tickerIndex]:
for i in columnIndexes:
# indexCount is incremented but not read anywhere in the visible code.
indexCount += 1
return newList
# Opens 'newcsv' for writing and prepends the ticker symbol to every row
# returned by getColumns().
# NOTE(review): ``fields`` and the csv writer ``write`` are created but never
# used in the visible code; the statements that write rows appear to be missing.
def csvOutputer():
rows = getColumns()
fields = ["Ticker", "Strike", "Bid", "Ask"]
with open('newcsv', 'w') as f:
write = csv.writer(f)
# rows[i] is expected to hold the rows belonging to ticker()[i].
for i in range(len(ticker())):
for j in rows[i]:
# Put the ticker symbol in the first column of the row.
j.insert(0, ticker()[i])

text substitution {} does not work at scrapinghub

I create a url with {} format to change the url on the fly.
It works totally fine on my PC.
But once I upload and run it from scrapinghub one(state) of the many substitutions(others work fine) does not work, it returns %7B%7D& in the url which is encoded curly braces.
Why does this happen? What do I miss when referencing State variable?
This is the url from my code:
def __init__(self):
    """Set the default state, zip code, tax rate and model years."""
    self.state, self.zip = 'AL', '35204'
    self.tax_rate = 0
    # Consecutive model years 2007 through 2017 inclusive.
    self.years = list(range(2007, 2018))
def parse_m(self, response):
    """Build a vehicles query URL for every model and mileage step.

    Reads the model list from the JSON body of *response* and the year and
    make from ``response.meta``. In the visible code the URL is only built,
    not used.
    """
    payload = json.loads(response.text)
    models = payload['models']
    meta = response.meta
    year = meta['year']
    make = meta['make']
    # 40000 up to and including 100000, in steps of 10000.
    mileage_steps = range(40000, 100001, 10000)
    url_template = '****/vehicles/?year={}&make={}&model={}&state={}&mileage={}&zip={}'
    for model in models:
        for mileage in mileage_steps:
            url = url_template.format(year, make, model, self.state, mileage, self.zip)
and this is the url i see in the log of scrapinghub:
This is not a scrapinghub issue. It has to be your code only. If I do below
>>> "state={}".format({})
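This would end up being 'state={}' - the placeholder is filled with an empty dict, and the curly braces are then URL-encoded as %7B%7D.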
I would add
assert type(self.state) is str
to my code to ensure this situation doesn't happen and if it does then you get an AssertionError

class not functioning methods properly

I defined a class to handle blocks of tweets so I could manage them a little easier
class twitter_block(object):
    """Container for a batch of tweets.

    Attributes are documented on ``__init__``.
    """

    def __init__(self, tag=''):
        """Create an empty block.

        Args:
            tag: Optional label for the block. Defaults to the empty string,
                so existing ``twitter_block()`` calls behave as before.
        """
        self.tweets = []          # tweets loaded through load()
        self.df = pd.DataFrame()  # starts as an empty DataFrame
        self.tag = tag

    def load(self, data):
        """Replace the stored tweets with the items of the iterable *data*."""
        self.tweets = list(data)
then defined a method as part of a pipeline:
def clean(self):
    """Remove links and hashtags from every tweet in ``self.tweets``.

    Each tweet is converted with ``str()`` first, so the stored list ends up
    containing strings only.
    """
    # Raw strings keep the regex escapes away from Python's own string
    # escaping. The link pattern is deliberately not anchored with ``^``,
    # so a link is removed wherever it appears in the tweet, not only at
    # the start of a line.
    link_pattern = re.compile(r'https?://\S+')
    # A hashtag plus one optional trailing space.
    hashtag_pattern = re.compile(r'#\w+ ?')
    without_links = (link_pattern.sub('', str(tweet)) for tweet in self.tweets)
    self.tweets = [hashtag_pattern.sub('', text) for text in without_links]
but when I call this:
tweet = load_data('The_Donald.json')
block = twitter_block(tag='donald')
it returns the 1504 tweets that I loaded into the block object same as before, no cleaning links or anything. Although, actually it does remove # signs... but this method,
def smilecheck(self):
    """Keep only the tweets that contain a ``:)`` or ``:(`` smiley.

    The statement that acted on a match is not present in the original
    snippet; dropping tweets without a smiley follows the intent described
    next to it.
    """
    # Unanchored pattern used with search(): the smiley may appear anywhere
    # in the tweet, not only as the tweet's entire content.
    smiley_pattern = re.compile(r':\(|:\)')
    self.tweets = [
        tweet for tweet in self.tweets
        if smiley_pattern.search(str(tweet))
    ]
does not remove the tweets without smileys, returns 1504 tweets, the same as I put in... any help guys? im sure this is a problem with the way I am approaching objects
I believe the problem is that you are using re.match() instead of re.search()
Where you want to find the tweets that contain a smiley anywhere in the tweet, re.match() searches only from the beginning of the string.
See python -- re.match vs re.search

context for using `yield` keyword in python

I have the following program to scrape data from a website. I want to improve the code below by using a generator with yield, instead of calling generate_url and callme multiple times sequentially. The purpose of this exercise is to properly understand yield and the context in which it can be used.
import requests
import shutil
yf_base_url ='http://real-chart.finance.yahoo.com/table.csv?s=%5E'
index_list = ['BSESN','NSEI']
def generate_url(index, start_date, end_date):
    """Return the download URL for *index* between two dates.

    Args:
        index: Index symbol; only 'BSESN' and 'NSEI' are supported.
        start_date: Dash-separated date whose first three parts fill the
            ``a``, ``b`` and ``c`` query parameters.
        end_date: Dash-separated date whose first three parts fill the
            ``d``, ``e`` and ``f`` query parameters.

    Raises:
        ValueError: If *index* is not a supported symbol.
        IndexError: If a date has fewer than three dash-separated parts.
    """
    if index not in ('BSESN', 'NSEI'):
        # Fail with a clear message instead of reaching the return with no
        # URL assigned.
        raise ValueError('Unsupported index: {!r}'.format(index))
    # Split each date once instead of once per component.
    start_parts = start_date.split('-')
    end_parts = end_date.split('-')
    s_day, s_month, s_year = start_parts[0], start_parts[1], start_parts[2]
    e_day, e_month, e_year = end_parts[0], end_parts[1], end_parts[2]
    query = '&a={}&b={}&c={}&d={}&e={}&f={}'.format(s_day,s_month,s_year,e_day,e_month,e_year)
    return yf_base_url + index + query
def callme(url,index):
    """Download *url* and save the response body as ``<index>file.csv``.

    Prints the URL, then either "Failure!!" (nothing is written) when the
    status code is not 200, or "Success" after the file has been written.
    """
    print('URL {}'.format(url))
    # NOTE(review): verify=False turns off certificate verification; kept
    # unchanged from the original call -- confirm it is really needed.
    r = requests.get(url, verify=False,stream=True)
    if r.status_code != 200:
        print("Failure!!")
        # Stop here: an error response must not be saved as CSV data nor
        # reported as a success.
        return
    # Presumably asks the raw stream to undo any transfer encoding before
    # the bytes are copied -- confirm against the urllib3 documentation.
    r.raw.decode_content = True
    with open(index + "file.csv", 'wb') as f:
        shutil.copyfileobj(r.raw, f)
    print("Success")
# Script entry point: builds the URL for each of the two entries of index_list.
# NOTE(review): start_date and end_date are not defined anywhere in the visible
# code, and ``url`` is overwritten without being used; the callme() calls
# appear to be missing from this view.
if __name__ == '__main__':
url = generate_url(index_list[0],start_date,end_date)
url = generate_url(index_list[1],start_date,end_date)
There are multiple options. You could use yield to iterate over URL's. Or over request objects.
If your index_list were long, I would suggest yielding URLs.
Because then you could use multiprocessing.Pool to map a function that does a request and saves the output over these URLs. That would execute them in parallel, potentially making it a lot faster (assuming that you have enough network bandwidth, and that yahoo finance doesn't throttle connections).
yf ='http://real-chart.finance.yahoo.com/table.csv?s=%5E'
index_list = ['BSESN','NSEI']
def genurl(symbols, start_date, end_date, base_url=None):
    """Yield one download URL per symbol.

    Args:
        symbols: Iterable of index symbols.
        start_date: Date made of exactly three dash-separated parts; they
            fill the ``a``, ``b`` and ``c`` query parameters.
        end_date: Date made of exactly three dash-separated parts; they
            fill the ``d``, ``e`` and ``f`` query parameters.
        base_url: URL prefix placed before each symbol. Defaults to the
            module-level ``yf``.

    Raises:
        ValueError: If a date does not have exactly three parts (raised when
            the generator is first advanced).
    """
    if base_url is None:
        base_url = yf
    # assemble the URLs
    s_day, s_month, s_year = start_date.split('-')
    e_day, e_month, e_year = end_date.split('-')
    # The base URL has no ``{}`` placeholders, so calling .format() on it
    # would return it unchanged for every symbol. Build the query string
    # explicitly and append it after the symbol instead.
    query = '&a={}&b={}&c={}&d={}&e={}&f={}'.format(s_day,s_month,s_year,e_day,e_month,e_year)
    for s in symbols:
        yield base_url + s + query
# Worker handed to the pool below; it receives one URL per call.
# NOTE(review): the function has no body in this view, only the placeholder
# comment that follows.
def download(url):
# Do the request, save the file
# NOTE(review): ``multiprocessing`` is not imported in the visible snippet.
p = multiprocessing.Pool()
# Applies download() to every URL yielded by genurl() and stores the list of
# return values in rv.
rv = p.map(download, genurl(index_list, '03-03-1997', '10-04-2015'))
If I understand you correctly, what you want to know is how to change the code so that you can replace the last part by
if __name__ == '__main__':
for url in generate_url(index_list,start_date,end_date):
If this is correct, you need to change generate_url, but not callme. Changing generate_url is rather mechanical. Make the first parameter index_list instead of index, wrap the function body in a for index in index_list loop, and change return url to yield url.
You don't need to change callme because you never want to say something like for call in callme(...). You won't do anything with it but a normal function call.

How do I instantiate a group of objects from a text file?

I have some log files that look like many lines of the following:
<tickPrice tickerId=0, field=2, price=201.81, canAutoExecute=1>
<tickSize tickerId=0, field=3, size=25>
<tickSize tickerId=0, field=8, size=534349>
<tickPrice tickerId=0, field=2, price=201.82, canAutoExecute=1>
I need to define a class of type tickPrice or tickSize. I will need to decide which to use before doing the definition.
What would be the Pythonic way to grab these values? In other words, I need an effective way to reverse str() on a class.
The classes are already defined and just contain the presented variables, e.g., tickPrice.tickerId. I'm trying to find a way to extract these values from the text and set the instance attributes to match.
Edit: Answer
This is what I ended up doing-
# Read the simulation log line by line and rebuild a tickPrice or tickSize
# message object from the text of each line.
with open(commandLineOptions.simulationFilename, "r") as simulationFileHandle:
    for simulationFileLine in simulationFileHandle:
        # Each line holds three tab-separated fields; the message is the last.
        (date, time, msgString) = simulationFileLine.split("\t")
        # Strip the markup characters once for both message kinds. Plain
        # str.replace is used because translate(None, ...) only exists on
        # Python 2 byte strings.
        msgStringCleaned = msgString
        for unwantedChar in "<>,":
            msgStringCleaned = msgStringCleaned.replace(unwantedChar, "")
        msgList = msgStringCleaned.split(" ")
        if "tickPrice" in msgString:
            msg = message.tickPrice()
            # Each slice skips the "name=" prefix of its field.
            msg.tickerId = int(msgList[1][9:])
            msg.field = int(msgList[2][6:])
            msg.price = float(msgList[3][6:])
            msg.canAutoExecute = int(msgList[4][15:])
        elif "tickSize" in msgString:
            msg = message.tickSize()
            msg.tickerId = int(msgList[1][9:])
            msg.field = int(msgList[2][6:])
            msg.size = int(msgList[3][5:])
        else:
            # Reported only for lines that are neither of the known kinds.
            print("Unsupported tick message type")
I'm not sure how you want to dynamically create objects in your namespace, but the following will at least dynamically create objects based on your loglines:
Take your line:
line = '<tickPrice tickerId=0, field=2, price=201.81, canAutoExecute=1>'
Remove chars that aren't interesting to us, then split the line into a list:
line = line.translate(None, ''.join('<>,'))
line = line.split(' ')
Name the potential class attributes for convenience:
line_attrs = line[1:]
Then create your object (name, base tuple, dictionary of attrs):
tickPriceObject = type(line[0], (object,), { key:value for key,value in [at.split('=') for at in line_attrs]})()
Prove it works as we'd expect:
# 2
Approaching the problem with regex, but with the same result as tristan's excellent answer (and stealing his use of the type constructor that I will never be able to remember)
import re
# Matches one log line such as "<tickSize tickerId=0, field=3, size=25>".
# Group "classname" is the word after "<"; group "arguments" is the whole
# "name=value, name=value" list before ">".
class_instance_re = re.compile(r"""
    <(?P<classname>\w[a-zA-Z0-9]*)[ ]
    (?P<arguments>
        (?:\w[a-zA-Z0-9]*=[0-9.]+[, ]*)+
    )>""", re.X)
objects = []
for line in whatever_file:
    result = class_instance_re.match(line)
    if result is None:
        # Not a "<ClassName name=value, ...>" line; nothing to build from it.
        continue
    # The named groups live on the match object, not on the input string.
    classname = result.group('classname')
    arguments = result.group('arguments')
    # Build a new class named after the log entry, with one class attribute
    # per "name=value" pair (values stay strings).
    new_obj = type(classname, (object,),
                   dict(s.split('=', 1) for s in arguments.split(', ')))
    # NOTE(review): the visible code never stores new_obj in ``objects``;
    # presumably an append (and perhaps an instantiation) follows -- confirm.
