Here is my code. It creates the files, which means it has write permissions, but it doesn't write anything inside them. The function works and it prints to the screen; it just doesn't write anything. Any idea why that is?
Expected behavior: it creates two files, filters the websites into working and non-working, and states the error code for the non-working ones.
Current behavior: it creates two empty files.
import requests
from concurrent.futures import ThreadPoolExecutor

websites = []
f = open("websites.txt", "r")
for i in f:
    if not i.startswith("http"):
        i = "http://" + i
    websites.append(i.strip())
    print("appended" + i)
f.close()

with open("working.txt", "w") as fa, open("not_working.txt", "w") as fe:
    def checker(website):
        response = requests.get(website)
        available = response.status_code == 200
        print(response.status_code)
        if available:
            fa.write(website + "\n")
        else:
            fe.write(website + " error " + response.status_code + "\n")

    with ThreadPoolExecutor() as executor:
        executor.map(checker, websites)
The line
executor.map(checker, websites)
creates a generator, but by itself it doesn't execute the threads. It needs at least list()
list(executor.map(checker, websites))
to execute the generator.
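For illustration, a minimal standalone sketch of this behaviour (double() is just a placeholder function, not part of the original code):

from concurrent.futures import ThreadPoolExecutor

def double(x):
    return x * 2

with ThreadPoolExecutor() as executor:
    lazy = executor.map(double, [1, 2, 3])  # a lazy iterator; nothing has been collected yet
    results = list(lazy)                    # consuming it collects the results (and re-raises
                                            # any exception that was raised inside double)

print(results)  # [2, 4, 6]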
Full code:
import requests
from concurrent.futures import ThreadPoolExecutor

# --- functions ---

def read_urls(filename):
    websites = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:  # skip empty lines
                if not line.startswith("http"):
                    line = "http://" + line
                websites.append(line)
                print("appended:", line)
    return websites

def checker(website):
    response = requests.get(website)
    print('[checker]:', response.status_code, website)
    if response.status_code == 200:
        fa.write(f'{website}\n')
    else:
        fe.write(f'{website} error {response.status_code}\n')

# --- main ---

#websites = read_urls("websites.txt")
websites = ['https://stackoverflow.com', 'https://fake.com']

with open("working.txt", "w") as fa, open("not_working.txt", "w") as fe:
    with ThreadPoolExecutor() as executor:
        list( executor.map(checker, websites) )
But it may be safer to return the result from the function and do the writing in the main thread. That also writes the results in the same order as the data in the original file; writing inside the threads may produce a different order, because the threads may finish in a different order.
import requests
from concurrent.futures import ThreadPoolExecutor

# --- functions ---

def read_urls(filename):
    websites = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:  # skip empty lines
                if not line.startswith("http"):
                    line = "http://" + line
                websites.append(line)
                print("appended:", line)
    return websites

def checker(website):
    response = requests.get(website)
    print('[checker]:', response.status_code, website)
    return website, response.status_code

# --- main ---

#websites = read_urls("websites.txt")
websites = ['https://stackoverflow.com', 'https://fake.com']

with open("working.txt", "w") as fa, open("not_working.txt", "w") as fe:
    with ThreadPoolExecutor() as executor:
        for website, status_code in executor.map(checker, websites):
            print('[main]:', status_code, website)
            if status_code == 200:
                fa.write(f'{website}\n')
            else:
                fe.write(f'{website} error {status_code}\n')
I have created a script to download multiple images. I have another file (linkVars.py) in which there are URLs of the images to download. This script imports the linkVars.py file, then reads one URL at a time, downloads that image from the URL, and writes it into a file named {file_name}.jpg.
Below is the code for the explanation above:
import requests
import linksVars as lV  # file with urls

def download_url(url):
    # Creating a function
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("=") + 1  # naming the image using text from the url
    name_from_url = url[file_name_start_pos:]
    file_name = name_from_url
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        # Opening the image file to write data into it
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r:
                f.write(data)
Now, I have multiple names written in name_file.txt (an external file). As I download an image, I want file_name in {file_name}.jpg to come from one name in name_file.txt. Then, as the code starts to download the next file, the next name in name_file.txt should be assigned to {file_name}.jpg. If someone could help me, I would be grateful!
Below is the complete code:
import requests
import linksVars as lV

def download_url(url):
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("=") + 1
    name_from_url = url[file_name_start_pos:]
    file_name = name_from_url
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r:
                f.write(data)

links = lV.List1

try:
    for listLinks in links:
        download_url(listLinks)
except(KeyboardInterrupt):
    print("\n\n===> Script ended by USER! <===")
Try this:
import requests
import linksVars as lV  # Importing file with URLs stored in variables
import nameVars as nV   # Importing file with names stored in variables

links = lV.List1   # List1 is the list of URLs stored in variables
names = nV.Name1   # Name1 is the list of names stored in variables

# This function will download the image from the URL and name it from Name1
def download_url(url, names):
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("v=") + 1  # Find "v=" in the given URL
    name_from_url = url[file_name_start_pos:]
    file_name = names
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:  # The downloaded file will be opened and named
            for data in r:
                f.write(data)

try:
    for listLinks, listNames in zip(links, names):  # The for loop iterates over both lists together
        download_url(listLinks, listNames)
except(KeyboardInterrupt):
    print("\n\n===> Script ended by USER! <===")
I have a text file called tokens.txt.
Ex: 12463,4126,6343,6345.
I want to send a POST request with each token and use multithreading.
For some reason my code only gets the last token from the txt file and only uses that.
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time

url_list = [
    "https://www.google.com/api/"
]

file_lines = open("tokens.txt", "r").readlines()

for line in file_lines:
    tokens = {
        'Token': line.replace('/n', '')
    }

def makerequest(url):
    while True:
        html = requests.post(url, stream=True, data=tokens)
        print(tokens)
        return html.content

start = time()
processes = []
with ThreadPoolExecutor(max_workers=200) as executor:
    for url in url_list:
        processes.append(executor.submit(makerequest, url))
for task in as_completed(processes):
    print(task.result())

print(f'Time taken: {time() - start}')
How can I send a request for each token?
In your case tokens ends up as {"Token": <last_token>} because the loop overwrites it on every line.
Modify your code like this so that one request is sent for each token.
tokens = set()
'''
<- You can use a list as well, but in this case a set is better, as it ensures only
one request per token even if your tokens file contains duplicate lines.
'''

url_list = [
    "https://www.google.com/api/"
]

tokens = set()
with open("tokens.txt", "r") as f:
    file_lines = f.readlines()
for line in file_lines:
    tokens.add(line.strip())

token_data = {"Token": None}

def makerequest(url):
    for token in tokens:
        token_data["Token"] = token
        html = requests.post(url, stream=True, data=token_data)
        print(token)
        # do something with html here
        # don't return or break
You are doing
data = tokens
and at that point tokens holds the assignment from the last line. If you want all tokens, you need to do something like:
tokens = set()
for line in file_lines:
    tokens.add(......)
The problem with your code is the creation of the tokens dictionary: you loop over the tokens but you always overwrite the value mapped to the "Token" key.
Moreover, there are a few bad practices in your code.
First, be careful with opening files inline like you did:
file_lines = open("tokens.txt", "r").readlines()
Rather, use a context manager:
with open("tokens.txt", "r") as file:
    file_lines = file.readlines()
This makes sure that the file gets closed again after you read it; otherwise you would have to make sure yourself that the file gets closed (even after a crash, etc.).
Secondly, avoid using global variables in functions. From your code I assume that you want to query the different URLs with each token, so the function should accept both as arguments. I would then create a list of combinations like:
url_token_combs = [(url, token.strip()) for url in url_list for token in file_lines]
And finally, change your function to use the arguments handed to it rather than the global ones:
def makerequest(url_token):
    url, token = url_token
    html = requests.post(url, stream=True, data=token)
    return html.content
That now allows you to run it with threads like:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time

def makerequest(url_token):
    url, token = url_token
    html = requests.post(url, stream=True, data=token)
    print(token)
    return html.content

if __name__ == "__main__":
    start = time()
    url_list = [
        "https://www.google.com/api/"
    ]
    with open("tokens.txt", "r") as file:
        file_lines = file.readlines()
    tokens = [{'Token': line.strip()} for line in file_lines]
    url_tokens = [(url, token) for url in url_list for token in tokens]
    processes = []
    with ThreadPoolExecutor(max_workers=200) as executor:
        for url_token in url_tokens:
            processes.append(executor.submit(makerequest, url_token))
    for task in as_completed(processes):
        print(task.result())
    print(f'Time taken: {time() - start}')
I'm writing a short piece of code in Python to check the status code of a list of URLs. The steps are:
1. Read the URLs from a csv file.
2. Check the request status code.
3. Write the status code into the csv next to the checked URL.
I've managed the first two steps, but I'm stuck on writing the output of the requests into the same csv, next to the URLs. Please help.
import urllib.request
import urllib.error
from multiprocessing import Pool

file = open('innovators.csv', 'r', encoding="ISO-8859-1")
urls = file.readlines()

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        print('200' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=1)
    result = p.map(checkurl, urls)
    with open('innovators.csv', 'w') as f:
        for line in file:
            url = ''.join(line)
            checkurl(urls + "," + checkurl)
The .readlines() operation leaves the file object at the end of the file. When you attempt to loop through the lines of file again without first rewinding it (file.seek(0)) or closing and reopening it, there are no lines remaining. It is always recommended to use the with open(...) as file construct to ensure the file is closed when the operation is finished.
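For illustration, a tiny sketch of that rewind behaviour, using the same innovators.csv:

with open('innovators.csv', 'r', encoding="ISO-8859-1") as file:
    urls = file.readlines()   # the file position is now at end-of-file
    print(list(file))         # [] -- nothing left to read
    file.seek(0)              # rewind to the beginning
    print(len(list(file)))    # the same lines are available again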
Additionally, there appears to be an error in your input to checkurl. You have added a list (urls) to a string (",") to a function (checkurl).
You probably meant for this section to read
with open('innovators.csv', 'w') as f:
    for line in urls:
        url = ''.join(line.replace('\n', ''))  # readlines leaves a linefeed character at the end of each line
        f.write(url + "," + checkurl(url) + "\n")
The checkurl function should return what you are intending to place into the csv file. You are simply printing to standard output (screen). Thus, replace your checkurl with
def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
        ret = '0'
    except urllib.error.HTTPError as e:
        ret = 'HTTPError: {}'.format(e.code)
    except urllib.error.URLError as e:
        ret = 'URLError: {}'.format(e.reason)
    else:
        ret = '200'
    return ret
or something equivalent to your needs.
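Putting the two pieces together, a sketch of the whole flow with those changes might look like this (the output file name innovators_checked.csv is hypothetical, chosen to avoid overwriting the input while experimenting):

import urllib.request
import urllib.error
from multiprocessing import Pool

def checkurl(url):
    try:
        urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        return 'HTTPError: {}'.format(e.code)
    except urllib.error.URLError as e:
        return 'URLError: {}'.format(e.reason)
    return '200'

if __name__ == "__main__":
    with open('innovators.csv', 'r', encoding="ISO-8859-1") as f:
        urls = [line.strip() for line in f if line.strip()]

    with Pool(processes=4) as p:          # check the URLs in parallel
        results = p.map(checkurl, urls)

    # p.map preserves input order, so each status lines up with its URL
    with open('innovators_checked.csv', 'w') as f:
        for url, status in zip(urls, results):
            f.write(url + "," + status + "\n")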
Save the status in a dict and convert it to a DataFrame, then simply send it to a csv file. str(code.getcode()) will return 200 if the URL is connecting; otherwise an exception is raised, for which I assign the status '000'. So your csv file will contain url,200 if the URL is connecting and url,000 if it is not.
import urllib.request
import pandas as pd

# lines is the list of URLs read from the csv file
status_dict = {}
for line in lines:
    try:
        code = urllib.request.urlopen(line)
        status = str(code.getcode())
        status_dict[line] = status
    except:
        status = "000"
        status_dict[line] = status

# Build a two-column DataFrame (url, status) and write it out
df = pd.DataFrame(list(status_dict.items()), columns=['url', 'status'])
df.to_csv('filename.csv', index=False)
I am querying an API from a website. The API will be down for maintenance from time to time, and there may also be no data available for querying at times. I have written the code to keep forcing the program to query the API even after an error, but it doesn't seem to be working.
The following is the code:
import threading
import json
import urllib
from urllib.parse import urlparse
import httplib2 as http  # External library
import datetime
import pyodbc as db
import os
import gzip
import csv
import shutil

def task():
    # Authentication parameters
    headers = {'AccountKey': 'secret',
               'accept': 'application/json'}  # this is by default

    # API parameters
    uri = 'http://somewebsite.com/'  # Resource URL
    path = '/something/TrafficIncidents?'

    # Build query string & specify type of API call
    target = urlparse(uri + path)
    print(target.geturl())
    method = 'GET'
    body = ''

    # Get handle to http
    h = http.Http()

    # Obtain results
    response, content = h.request(target.geturl(), method, body, headers)
    api_call_time = datetime.datetime.now()

    filename = "traffic_incidents_" + str(datetime.datetime.today().strftime('%Y-%m-%d'))
    createHeader = 1
    if os.path.exists(filename + '.csv'):
        csvFile = open(filename + '.csv', 'a')
        createHeader = 0
    else:
        # compress previous day's file
        prev_filename = "traffic_incidents_" + (datetime.datetime.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        if os.path.exists(prev_filename + '.csv'):
            with open(prev_filename + '.csv', 'rb') as f_in, gzip.open(prev_filename + '.csv.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.remove(prev_filename + '.csv')
        # create new csv file for writing
        csvFile = open(filename + '.csv', 'w')

    # Parse JSON to print
    jsonObj = json.loads(content)
    print(json.dumps(jsonObj, sort_keys=True, indent=4))
    with open("traffic_incidents.json", "w") as outfile:
        # Saving jsonObj["d"]
        json.dump(jsonObj, outfile, sort_keys=True, indent=4, ensure_ascii=False)

    for i in range(len(jsonObj["value"])):
        jsonObj["value"][i]["IncidentTime"] = jsonObj["value"][i]["Message"].split(' ', 1)[0]
        jsonObj["value"][i]["Message"] = jsonObj["value"][i]["Message"].split(' ', 1)[1]
        jsonObj["value"][i]["ApiCallTime"] = api_call_time

    # Save to csv file
    header = jsonObj["value"][0].keys()
    csvwriter = csv.writer(csvFile, lineterminator='\n')
    if createHeader == 1:
        csvwriter.writerow(header)
    for i in range(len(jsonObj["value"])):
        csvwriter.writerow(jsonObj["value"][i].values())
    csvFile.close()

    t = threading.Timer(120, task)
    t.start()

while True:
    try:
        task()
    except IndexError:
        pass
    else:
        break
I get the following error and the program stops:
header = jsonObj["value"][0].keys()
IndexError: list index out of range
I would like the program to keep running even after the IndexError has occurred.
How can I edit the code to achieve that?
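For reference, one possible sketch of such a retry loop, assuming the threading.Timer re-scheduling is removed from task() and that skipping an empty response and trying again later is acceptable (the 120-second interval is taken from the Timer in the code above):

import time

while True:
    try:
        task()                 # one API call + csv write, with the Timer lines removed
    except IndexError:
        # jsonObj["value"] was empty (API down or no incidents); just wait and retry
        print("No data returned, retrying after the next interval")
    time.sleep(120)            # same interval the original Timer used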
I am trying to access a page by incrementing the page counter using the opencorporates api. The problem is that there are times when useless data is there. For example, in the below url for jurisdiction_code = ae_az I get a webpage showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How can I check for such data and skip over it to move on to the next jurisdiction?
This is my code
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}?api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
    break
...
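Slotted into the while loop from the question, the check would sit right after the response is read. A sketch, keeping the question's Python 2 style:

while True:
    i += 1
    try:
        response = urllib2.urlopen(url.format(id, i))
    except urllib2.HTTPError, e:
        break
    content = response.read()
    data = json.loads(content)
    if not data.get('results', {}).get('companies'):
        break  # empty page: stop and move on to the next jurisdiction
    with open(os.path.join(directory, str(i) + '.json'), 'w') as out:
        out.write(content)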
Here's your code written with Requests and using the answer here. It is nowhere near as robust or clean as it should be, but demonstrates the path you might want to take. The rate limit is a guess, and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep

import requests

url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20  # seconds to wait after being rate limited

with open('codes') as f:
    codes = [l.strip('\n') for l in f]

def get_page(code, page, **kwargs):
    params = {
        # 'api_token': token,
        'jurisdiction_code': code,
        'page': page,
    }
    params.update(kwargs)
    while True:
        r = requests.get(url, params=params)
        try:
            data = r.json()
        except ValueError:
            return None
        if 'error' in data:
            print data['error']['message']
            sleep(rate)
            continue
        return data['results']

def dump_page(code, page, data):
    with open(os.path.join(code, str(page) + '.json'), 'w') as f:
        json.dump(data, f)

for code in codes:
    try:
        os.makedirs(code)
    except os.error:
        pass
    data = get_page(code, 1)
    if data is None:
        continue
    dump_page(code, 1, data['companies'])
    for page in xrange(1, int(data.get('total_pages', 1))):
        data = get_page(code, page)
        if data is None:
            break
        dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
    json_content = json.loads(content)
    if json_content['results']['total_count'] > 0:
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
except KeyError:
    break
except ValueError:
    break
etc. You might want to report the ValueError or the KeyError, but that's up to you.