I am trying to access pages by incrementing the page counter using the OpenCorporates API. The problem is that sometimes the response contains no useful data. For example, at the URL below with jurisdiction_code = ae_az I get a web page showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How do I check for such data and skip over it to move on to the next jurisdiction?
This is my code
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}&api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
    break
...
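Alternatively, the response already carries paging metadata, so you can stop as soon as the reported totals say there is nothing left to fetch. A sketch of the same check inside your while loop (i is the page counter from your loop; field names are taken from the JSON shown above):

content = response.read()
results = json.loads(content).get('results', {})
# the ae_az response above has total_count == 0 and total_pages == 0
if results.get('total_count', 0) == 0 or i > results.get('total_pages', 0):
    break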
Here's your code rewritten with Requests, using the answer here. It is nowhere near as robust or clean as it should be, but it demonstrates the path you might want to take. The rate limit is a guess and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep

import requests

url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20  # seconds to wait after being rate limited

with open('codes') as f:
    codes = [l.strip('\n') for l in f]

def get_page(code, page, **kwargs):
    params = {
        # 'api_token': token,
        'jurisdiction_code': code,
        'page': page,
    }
    params.update(kwargs)
    while True:
        r = requests.get(url, params=params)
        try:
            data = r.json()
        except ValueError:
            return None
        if 'error' in data:
            print data['error']['message']
            sleep(rate)
            continue
        return data['results']

def dump_page(code, page, data):
    with open(os.path.join(code, str(page) + '.json'), 'w') as f:
        json.dump(data, f)

for code in codes:
    try:
        os.makedirs(code)
    except os.error:
        pass
    data = get_page(code, 1)
    if data is None:
        continue
    dump_page(code, 1, data['companies'])
    # pages 2..total_pages; page 1 was already fetched and dumped above
    for page in xrange(2, int(data.get('total_pages', 1)) + 1):
        data = get_page(code, page)
        if data is None:
            break
        dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
    json_content = json.loads(content)
    if json_content['results']['total_count'] > 0:
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
except KeyError:
    break
except ValueError:
    break
etc. You might want to report the ValueError or the KeyError, but that's up to you.
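For instance, a minimal sketch of reporting before skipping (the helper name and message wording here are illustrative, not from the original code):

import json

def total_count(content, page):
    # Return results.total_count from a response body, reporting problems.
    try:
        return json.loads(content)['results']['total_count']
    except ValueError as e:
        print 'Page {0}: response is not valid JSON: {1}'.format(page, e)
    except KeyError as e:
        print 'Page {0}: missing expected field {1}'.format(page, e)
    return None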
Related
I am trying to download files using Python and then add lines at the end of the downloaded files, but it returns an error:

f.write(data + """<auth-user-pass>
TypeError: can't concat str to bytes

Edit: Thanks, it works now when I do b"""<auth-user-pass>""", but I only want to add the string at the end of the file. When I run the code, it adds the string for every line.

I also tried something like this, but it also did not work: f.write(str(data) + "<auth-user-pass>")
Here is my full code:
import os  # needed for os.path.join
import requests
from multiprocessing.pool import ThreadPool

def download_url(url):
    print("downloading: ", url)
    # assumes that the last segment after the / represents the file name
    # if url is abc/xyz/file.txt, the file name will be file.txt
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    save_path = 'ovpns/'
    complete_path = os.path.join(save_path, file_name)
    print(complete_path)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(complete_path, 'wb') as f:
            for data in r:
                f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
    return url

servers = [
    "us-ca72.nordvpn.com",
    "us-ca73.nordvpn.com"
]

urls = []
for server in servers:
    urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")

# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
    print(r)
Try this:
import os
import requests
from multiprocessing.pool import ThreadPool

save_path = 'ovpns/'

def download_url(url):
    print("downloading: ", url)
    # assumes that the last segment after the / represents the file name
    # if url is abc/xyz/file.txt, the file name will be file.txt
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    complete_path = os.path.join(save_path, file_name)
    print(complete_path)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(complete_path, 'wb') as f:
            for data in r:
                f.write(data)
        # append the auth block once, after the whole file has been written
        # (done here so complete_path is in scope and each file gets it exactly once)
        with open(complete_path, 'ab') as f:
            f.write(b"""<auth-user-pass>
username
password
</auth-user-pass>""")
    return url

servers = [
    "us-ca72.nordvpn.com",
    "us-ca73.nordvpn.com"
]

urls = []
for server in servers:
    urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")

# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
    print(r)
You are writing in binary mode, so encode your string before concatenating. That is, replace

for data in r:
    f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")

with

for data in r:
    f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""".encode())
You opened the file for writing in binary mode.
Because of that you can't write normal strings, as the comment from #user56700 said.
You either need to encode the string or open the file another way (e.g. 'a' for appending).
Also note that opening a file with 'w' or 'wb' truncates it, deleting any existing data. If you want to keep what is already in the file and only add to the end, open it in append mode ('ab' for binary) instead.
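A minimal sketch of appending bytes to an existing file without touching its contents (the path is just a placeholder):

# 'ab' = append, binary: writes go to the end, existing data is kept
with open('ovpns/example.ovpn', 'ab') as f:
    f.write(b"<auth-user-pass>\nusername\npassword\n</auth-user-pass>\n")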
Here is my code. It creates the files, which means it has write permissions, but it just doesn't write anything inside them. The function works and prints to the screen; it just doesn't write anything. Any idea why that is?
Expected behavior: it creates two files and sorts websites into working or non-working, stating the error code for the non-working ones.
Current behavior: it creates two empty files.
import requests
from concurrent.futures import ThreadPoolExecutor

websites = []
f = open("websites.txt", "r")
for i in f:
    if not i.startswith("http"):
        i = "http://" + i
    websites.append(i.strip())
    print("appended" + i)
f.close()

with open("working.txt", "w") as fa, open("not_working.txt", "w") as fe:
    def checker(website):
        response = requests.get(website)
        available = response.status_code == 200
        print(response.status_code)
        if available:
            fa.write(website + "\n")
        else:
            fe.write(website + " error " + response.status_code + "\n")

    with ThreadPoolExecutor() as executor:
        executor.map(checker, websites)
The code
executor.map(checker, websites)
creates a generator, but it doesn't execute the threads.
It needs at least list()
list( executor.map(checker, websites) )
to consume (and so execute) the generator.
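Consuming the iterator is also where any exception raised inside checker resurfaces; otherwise failures can be silently hidden. A small self-contained sketch (the fails function is made up for illustration):

from concurrent.futures import ThreadPoolExecutor

def fails(x):
    raise ValueError(x)

with ThreadPoolExecutor() as executor:
    results = executor.map(fails, [1, 2])  # no error visible yet
    try:
        for r in results:  # the ValueError is re-raised here, in the main thread
            print(r)
    except ValueError as e:
        print('worker failed with:', e)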
Full code:
import requests
from concurrent.futures import ThreadPoolExecutor
# --- functions ---
def read_urls(filename):
websites = []
with open(filename) as f:
for line in f:
line = line.strip()
if line: # skip empty lines
if not line.startswith("http"):
line = "http://" + line
websites.append(line)
print("appended:", line)
return websites
def checker(website):
response = requests.get(website)
print('[checker]:', response.status_code, website)
if response.status_code == 200:
fa.write(f'{website}\n')
else:
fe.write(f'{website} error {response.status_code}\n')
# --- main ---
#websites = read_urls("websites.txt")
websites = ['https://stackoverflow.com', 'https://fake.com']
with open("working.txt", "w") as fa, open("not_working.txt", "w") as fe:
with ThreadPoolExecutor() as executor:
list( executor.map(checker, websites) )
But it may be safer to return the result from the function and write in the main thread. That also writes results in the same order as the data in the original file; writing in the threads may produce a different order, because the threads may finish in a different order.
import requests
from concurrent.futures import ThreadPoolExecutor

# --- functions ---

def read_urls(filename):
    websites = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:  # skip empty lines
                if not line.startswith("http"):
                    line = "http://" + line
                websites.append(line)
                print("appended:", line)
    return websites

def checker(website):
    response = requests.get(website)
    print('[checker]:', response.status_code, website)
    return website, response.status_code

# --- main ---

#websites = read_urls("websites.txt")
websites = ['https://stackoverflow.com', 'https://fake.com']

with open("working.txt", "w") as fa, open("not_working.txt", "w") as fe:
    with ThreadPoolExecutor() as executor:
        for website, status_code in executor.map(checker, websites):
            print('[main]:', status_code, website)
            if status_code == 200:
                fa.write(f'{website}\n')
            else:
                fe.write(f'{website} error {status_code}\n')
I wanted to scrape a few PDFs from a great history crash course I used to read a long time ago. Sadly, the old website is down, and I only managed to get the old HTML code from archive.org
(the links I got work fine, e.g. https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing).
This script results in HTML files being downloaded, saying:
"We're sorry but your computer or network may be sending automated queries. To protect our users, we can't process your request right now."
Is there a way to bypass this? I tried putting a few random delays into the code, so this might be insufficient, or I might be on Google's blacklist for now.
(The text.txt file can be found here: https://filebin.net/k2qw09embamx05ey)
import requests
import time
import random

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    time.sleep(random.randrange(1, 2))
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

f = open('text.txt')
long_string = f.readlines()

interesting_strings = []
for item in long_string:
    if 'drive.google' in item:
        interesting_strings.append(item)
print(interesting_strings)

interesting_strings = interesting_strings[0]
interesting_strings = interesting_strings.split('https://web.archive.org/web/20161219093036/')

links = []
for item in interesting_strings:
    if 'drive.google' in item:
        idx = item.find('"')
        links.append(item[:idx])

cntr = 1
for link in links:
    print(link)
    fname = './data/History_' + str(cntr)
    file_id = link.split('/')[-2]
    print('id:', file_id)
    destination = fname
    download_file_from_google_drive(file_id, destination)
    print('Getting file #', str(cntr))
    cntr += 1
    time.sleep(random.randrange(3, 15) + random.random())
Use gdown:
import gdown
file_id = '0BzRJiIvdbSoKcHpGUWJBUDZ2WDA'
filename = 'file.pdf'
url = 'https://drive.google.com/uc?id=' + file_id
gdown.download(url, filename, quiet=False)
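gdown is a third-party package (pip install gdown); it handles Google Drive's download-confirmation step for you, which is what the manual get_confirm_token code above was trying to do by hand.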
I'm writing a short piece of code in Python to check the status codes of a list of URLs. The steps are:
1. Read the URLs from a csv file.
2. Check the request status code.
3. Write the status code into the csv, next to the checked URL.
I've managed the first two steps, but I'm stuck on writing the output of the requests back into the same csv, next to the URLs. Please help.
import urllib.request
import urllib.error
from multiprocessing import Pool

file = open('innovators.csv', 'r', encoding="ISO-8859-1")
urls = file.readlines()

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        print('200' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=1)
    result = p.map(checkurl, urls)
    with open('innovators.csv', 'w') as f:
        for line in file:
            url = ''.join(line)
            checkurl(urls + "," + checkurl)
The .readlines() operation leaves the file object at the end of the file. When you attempt to loop through the lines of file again without first rewinding it (file.seek(0)) or closing and reopening it (file.close() followed by opening again), there are no lines remaining. It is always recommended to use the with open(...) as file construct to ensure the file is closed when the operation is finished.
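A minimal illustration of the rewind (the file name is just the one from your question):

with open('innovators.csv', encoding='ISO-8859-1') as file:
    urls = file.readlines()  # the file position is now at end of file
    file.seek(0)             # rewind, so the lines can be read again
    for line in file:
        print(line.strip())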
Additionally, there appears to be an error in your input to checkurl. You have added a list (urls) to a string (",") to a function (checkurl).
You probably meant for this section to read
with open('innovators.csv', 'w') as f:
    for line in urls:
        url = ''.join(line.replace('\n', ''))  # readlines leaves linefeed character at end of line
        f.write(url + "," + checkurl(url) + "\n")  # newline added so each result lands on its own row
The checkurl function should return what you are intending to place into the csv file. You are simply printing to standard output (screen). Thus, replace your checkurl with
def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
        ret = '0'
    except urllib.error.HTTPError as e:
        ret = 'HTTPError: {}'.format(e.code)
    except urllib.error.URLError as e:
        ret = 'URLError: {}'.format(e.reason)
    else:
        ret = '200'
    return ret
or something equivalent to your needs.
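Putting the two pieces together might look like this (a sketch; it writes to a new file, innovators_checked.csv, a made-up name, so the input file is not overwritten while its contents are still needed):

if __name__ == "__main__":
    with open('innovators.csv', 'r', encoding='ISO-8859-1') as f:
        urls = [line.strip() for line in f]
    p = Pool(processes=1)
    results = p.map(checkurl, urls)  # one status string per URL, in input order
    with open('innovators_checked.csv', 'w') as f:
        for url, status in zip(urls, results):
            f.write(url + "," + status + "\n")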
Save the statuses in a dict and convert it to a DataFrame, then simply send it to a csv file. str(code.getcode()) will return '200' if the URL is connecting; otherwise an exception is raised, for which I assigned the status '000'. So your csv file will contain url,200 if the URL is connecting and url,000 if it is not.
import urllib.request
import pandas as pd

status_dict = {}
for line in lines:  # lines: the URLs read from the csv file
    try:
        code = urllib.request.urlopen(line)
        status = str(code.getcode())
        status_dict[line] = status
    except:
        status = "000"
        status_dict[line] = status

# a dict of scalar values has to be turned into rows explicitly
df = pd.DataFrame(list(status_dict.items()), columns=['url', 'status'])
df.to_csv('filename.csv')
I am querying an API from a website. The API will be down for maintenance from time to time, and there may also be no data available for querying at times. I have written the code to keep forcing the program to query the API even after an error; however, it doesn't seem to be working.
The following is the code:
import threading
import json
import urllib
from urllib.parse import urlparse
import httplib2 as http  # External library
import datetime
import pyodbc as db
import os
import gzip
import csv
import shutil

def task():
    # Authentication parameters
    headers = {'AccountKey': 'secret',
               'accept': 'application/json'}  # this is by default

    # API parameters
    uri = 'http://somewebsite.com/'  # Resource URL
    path = '/something/TrafficIncidents?'

    # Build query string & specify type of API call
    target = urlparse(uri + path)
    print(target.geturl())
    method = 'GET'
    body = ''

    # Get handle to http
    h = http.Http()

    # Obtain results
    response, content = h.request(target.geturl(), method, body, headers)
    api_call_time = datetime.datetime.now()

    filename = "traffic_incidents_" + str(datetime.datetime.today().strftime('%Y-%m-%d'))
    createHeader = 1
    if os.path.exists(filename + '.csv'):
        csvFile = open(filename + '.csv', 'a')
        createHeader = 0
    else:
        # compress previous day's file
        prev_filename = "traffic_incidents_" + (datetime.datetime.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        if os.path.exists(prev_filename + '.csv'):
            with open(prev_filename + '.csv', 'rb') as f_in, gzip.open(prev_filename + '.csv.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.remove(prev_filename + '.csv')
        # create new csv file for writing
        csvFile = open(filename + '.csv', 'w')

    # Parse JSON to print
    jsonObj = json.loads(content)
    print(json.dumps(jsonObj, sort_keys=True, indent=4))
    with open("traffic_incidents.json", "w") as outfile:
        # Saving jsonObj["d"]
        json.dump(jsonObj, outfile, sort_keys=True, indent=4, ensure_ascii=False)

    for i in range(len(jsonObj["value"])):
        jsonObj["value"][i]["IncidentTime"] = jsonObj["value"][i]["Message"].split(' ', 1)[0]
        jsonObj["value"][i]["Message"] = jsonObj["value"][i]["Message"].split(' ', 1)[1]
        jsonObj["value"][i]["ApiCallTime"] = api_call_time

    # Save to csv file
    header = jsonObj["value"][0].keys()
    csvwriter = csv.writer(csvFile, lineterminator='\n')
    if createHeader == 1:
        csvwriter.writerow(header)
    for i in range(len(jsonObj["value"])):
        csvwriter.writerow(jsonObj["value"][i].values())
    csvFile.close()

    t = threading.Timer(120, task)
    t.start()

while True:
    try:
        task()
    except IndexError:
        pass
    else:
        break
I get the following error and the program stops:

header = jsonObj["value"][0].keys()
IndexError: list index out of range
I would like the program to keep running even after the IndexError has occurred.
How can I edit the code to achieve that?
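One option (a minimal sketch, assuming the IndexError comes from jsonObj["value"] being empty when the API has no incidents, and that the threading.Timer lines are removed from task() so this loop drives the schedule instead):

import time

while True:
    try:
        task()  # one query-and-write cycle
    except IndexError:
        # the API returned an empty "value" list; skip this cycle
        print("No data available; will retry")
    time.sleep(120)  # wait before querying again

Note that with the original threading.Timer approach, an IndexError raised inside a timer thread never reaches the try/except around the first task() call in the main loop, which is why the program still stops.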