I'm writing a short piece of code in python to check the status code of a list of URLS. The steps are
1. read the URL's from a csv file.
2. Check request code
3. Write the status code request into the csv next to the checked URL
The first two steps I've managed to do but I'm stuck with writing the output of the requests into the same csv, next to the urls. Please help.
import urllib.request
import urllib.error
from multiprocessing import Pool
file = open('innovators.csv', 'r', encoding="ISO-8859-1")
urls = file.readlines()
def checkurl(url):
try:
conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
print('HTTPError: {}'.format(e.code) + ', ' + url)
except urllib.error.URLError as e:
print('URLError: {}'.format(e.reason) + ', ' + url)
else:
print('200' + ', ' + url)
if __name__ == "__main__":
p = Pool(processes=1)
result = p.map(checkurl, urls)
with open('innovators.csv', 'w') as f:
for line in file:
url = ''.join(line)
checkurl(urls + "," + checkurl)
The .readlines() operation leaves the file object at the end of file. When you attempt to loop through the lines of file again, without first rewinding it (file.seek(0)) or closing and opening it again (file.close() followed by opening again), there are no lines remaining. Always recommended to use with open(...) as file construct to ensure file is closed when operation is finished.
Additionally, there appears to be an error in your input to checkurl. You have added a list (urls) to a string (",") to a function (checkurl).
You probably meant for this section to read
with open('innovators.csv', 'w') as f:
for line in urls:
url = ''.join(line.replace('\n','')) # readlines leaves linefeed character at end of line
f.write(url + "," + checkurl(url))
The checkurl function should return what you are intending to place into the csv file. You are simply printing to standard output (screen). Thus, replace your checkurl with
def checkurl(url):
try:
conn = urllib.request.urlopen(url)
ret='0'
except urllib.error.HTTPError as e:
ret='HTTPError: {}'.format(e.code)
except urllib.error.URLError as e:
ret='URLError: {}'.format(e.reason)
else:
ret='200'
return ret
or something equivalent to your needs.
Save the status in a dict. and convert it to dataframe. Then simply send it to a csv file. str(code.getcode()) will return 200 if the url is connecting else it will return an exception, for which i assigned status as '000'. So your csv file will contain url,200 if URL is connecting and url,000 if URL is not connecting.
status_dict={}
for line in lines:
try:
code = urllib.request.urlopen(line)
status = str(code.getcode())
status_dict[line] = status
except:
status = "000"
status_dict[line] = status
df = pd.Dataframe(status_dict)
df.to_csv('filename.csv')
Related
I am trying to download files using python and then add lines at the end of the downloaded files, but it returns an error:
f.write(data + """<auth-user-pass>
TypeError: can't concat str to bytes
Edit: Thanks, it works now when I do this b"""< auth-user-pass >""", but I only want to add the string at the end of the file. When I run the code, it adds the string for every line.
I also tried something like this but it also did not work: f.write(str(data) + "< auth-user-pass >")
here is my full code:
import requests
from multiprocessing.pool import ThreadPool
def download_url(url):
print("downloading: ", url)
# assumes that the last segment after the / represents the file name
# if url is abc/xyz/file.txt, the file name will be file.txt
file_name_start_pos = url.rfind("/") + 1
file_name = url[file_name_start_pos:]
save_path = 'ovpns/'
complete_path = os.path.join(save_path, file_name)
print(complete_path)
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
with open(complete_path, 'wb') as f:
for data in r:
f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
return url
servers = [
"us-ca72.nordvpn.com",
"us-ca73.nordvpn.com"
]
urls = []
for server in servers:
urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")
# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
print(r)
EDIT: Thanks, it works now when I do this b"""< auth-user-pass >""", but I only want to add the string at the end of the file. When I run the code, it adds the string for every line.
Try this:
import requests
from multiprocessing.pool import ThreadPool
def download_url(url):
print("downloading: ", url)
# assumes that the last segment after the / represents the file name
# if url is abc/xyz/file.txt, the file name will be file.txt
file_name_start_pos = url.rfind("/") + 1
file_name = url[file_name_start_pos:]
save_path = 'ovpns/'
complete_path = os.path.join(save_path, file_name)
print(complete_path)
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
with open(complete_path, 'wb') as f:
for data in r:
f.write(data)
return url
servers = [
"us-ca72.nordvpn.com",
"us-ca73.nordvpn.com"
]
urls = []
for server in servers:
urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")
# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
with open(complete_path, 'ab') as f:
f.write(b"""<auth-user-pass>
username
password
</auth-user-pass>""")
for r in results:
print(r)
You are using binary mode, encode your string before concat, that is replace
for data in r:
f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
using
for data in r:
f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""".encode())
You open the file as a write in binary.
Because of that you cant use normal strings like the comment from #user56700 said.
You either need to convert the string or open it another way(ex. 'a' = appending).
Im not completly sure but it is also possible that the write binary variant of open the data of the file deletes. Normally open with write deletes existing data, so its quite possible that you need to change it to 'rwb'.
I can't get my program to get every string possible from a split.
Here is one thing I tried:
var2 = "apple banana orange"
for var in var2.split():
#Here I would put what I want to do with the variable, but I put print() to show what happens
print(var)
I got:
applebananaorange
Full Code:
import requests
response = requests.get('https://raw.githubusercontent.com/Charonum/JSCode/main/Files.txt')
responsecontent = str(response.content)
for file in responsecontent.split("\n"):
file = file.replace("b'", "")
file = file.replace("'", "")
file = file.replace(r"\n", "")
if file == "":
pass
else:
print(file)
url = 'https://raw.githubusercontent.com/Charonum/JSCode/main/code/windows/' + file + ""
wget.download(url)
What should I do?
It looks like one of the files in the list is not available. It is good practice to always wrap input/output operations with a try/except to control problems like this. The code below downloads all available files and informs you which files could not be downloaded:
import requests
import wget
from urllib.error import HTTPError
response = requests.get('https://raw.githubusercontent.com/Charonum/JSCode/main/Files.txt')
responsecontent = str(response.content)
for file in responsecontent.split("\\n"):
file = file.replace("b'", "")
file = file.replace("'", "")
file = file.replace(r"\n", "")
if file == "":
pass
else:
url = 'https://raw.githubusercontent.com/Charonum/JSCode/main/code/windows/' + file + ""
print(url)
try:
wget.download(url)
except HTTPError:
print(f"Error 404: {url} not found")
It seems to work for me replacing the for statement with this one:
for file in responsecontent.split("\\n"):
...
Instead of responsecontent = str(response.content) try:
responsecontent = response.text
and then for file in responsecontent.split().
I am one step before finishing a project. As far as I know, all parts of the code works, and I have tested them separately. However, the output CSV still comes out empty for some reason. My code:
import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')
url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
count = 1
def result():
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text,'html.parser')
links = []
try:
for div in soup.select('.link'):
link = div.a.get('href')
links.append(link)
with open('french.csv', 'wb') as file:
writer = csv.writer(file)
for i in links:
res2 = requests.get(i)
soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
for each in soup2.select('li > strong'):
writer.writerow([each.text, each.next_sibling])
except:
pass
while not url.endswith('?dpt=010'):
print 'downloading %s' %url
result()
count += 1
url = 'http://www.constructeursdefrance.com/resultat/?dpt=0' + str(count)
url = 'http://www.constructeursdefrance.com/resultat/?dpt=10'
count = 10
while not url.endswith('?dpt=102'):
print 'downloading %s' %url
result()
count += 1
url = 'http://www.constructeursdefrance.com/resultat/?dpt=' + str(count)
print 'done'
This is really one of the first bigger projects I am trying to solve as a beginner. Being so close yet so stuck is frustrating, however. Any help is appreciated.
first, do not use try except in a large block, just use in a small place.
if you comment of you try except statement, this error will raise:
Traceback (most recent call last):
File "/home/li/PycharmProjects/tw/1.py", line 29, in <module>
result()
File "/home/li/PycharmProjects/tw/1.py", line 26, in result
writer.writerow([each.text, each.next_sibling])
TypeError: a bytes-like object is required, not 'str'
and this error message is clear, when it write to file, it require a bytes_like object, and you can check that the file you opened is in 'wb' mode, 'b' represent bytes mode, so the problem is clear, just change the mode to normal mode which require a str_like object:
with open('french.csv', 'w') as file:
Im trying to write different things onto a text file in a while loop but it only writes it once. I want to write something to unmigrated.txt
import urllib.request
import json
Txtfile = input("Name of the TXT file: ")
fw = open(Txtfile + ".txt", "r")
red = fw.read()
blue = red.split("\n")
i=0
while i<len(blue):
try:
url = "https://api.mojang.com/users/profiles/minecraft/" + blue[i]
rawdata = urllib.request.urlopen(url)
newrawdata = rawdata.read()
jsondata = json.loads(newrawdata.decode('utf-8'))
results = jsondata['id']
url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/" + results
rawdata_uuid = urllib.request.urlopen(url_uuid)
newrawdata_uuid = rawdata_uuid.read()
jsondata_uuid = json.loads(newrawdata_uuid.decode('utf-8'))
try:
results = jsondata_uuid['legacy']
print (blue[i] + " is " + "Unmigrated")
wf = open("unmigrated.txt", "w")
wring = wf.write(blue[i] + " is " + "Unmigrated\n")
except:
print(blue[i] + " is " + "Migrated")
except:
print(blue[i] + " is " + "Not-Premium")
i+=1
You keep overwriting opening the file with w inside the loop so you only see the last data that was written to the file, either open the file once outside the loop or open with a to append. Opening once would be the simplest approach, you can also use range instead of your while or better again just iterate over the list:
with open("unmigrated.txt", "w") as f: # with close your file automatically
for ele in blue:
.....
Also wring = wf.write(blue[i] + " is " + "Unmigrated\n") sets wring to None which is what write returns so probably not of any real use.
Lastly using a blank expect is usually never a good idea, catch the specific exceptions you expect and log or at least print when you get an error.
Using the requests library, I would break up your code doing something like:
import requests
def get_json(url):
try:
rawdata = requests.get(url)
return rawdata.json()
except requests.exceptions.RequestException as e:
print(e)
except ValueError as e:
print(e)
return {}
txt_file = input("Name of the TXT file: ")
with open(txt_file + ".txt") as fw, open("unmigrated.txt", "w") as f: # with close your file automatically
for line in map(str.rstrip, fw): # remove newlines
url = "https://api.mojang.com/users/profiles/minecraft/{}".format(line)
results = get_json(url).get("id")
if not results:
continue
url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/{}".format(results)
results = get_json(url_uuid).get('legacy')
print("{} is Unmigrated".format(line))
f.write("{} is Unmigrated\n".format(line))
I am not sure where 'legacy' fits into the code, that logic I will leave to you. You can also iterate directly over the file object so you can forget about splitting the lines into blue.
try:
with open("filename", "w") as f:
f.write("your content")
But that will overwrite all contents of the file.
Instead, if you want to append to the file use:
with open("filename", "a") as f:
If you choose to not use the with syntax, remember to close the file.
Read more here:
https://docs.python.org/2/library/functions.html#open
I am trying access a page by incrementing the page counter using opencorporates api. But the problem is there are times when useless data is there. For example in the below url for jurisdiction_code = ae_az I get webpage showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How to check for such data and skip over this to move on to next jurisdiction?
This is my code
import urllib2
import json,os
f = open('codes','r')
for line in f.readlines():
id = line.strip('\n')
url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26¤t_status=Active&page={1}?api_token=ab123cd45'
i = 0
directory = id
os.makedirs(directory)
while True:
i += 1
req = urllib2.Request(url.format(id, i))
print url.format(id,i)
try:
response = urllib2.urlopen(url.format(id, i))
except urllib2.HTTPError, e:
break
content = response.read()
fo = str(i) + '.json'
OUTFILE = os.path.join(directory, fo)
with open(OUTFILE, 'w') as f:
f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
break
...
Here's your code written with Requests and using the answer here. It is nowhere near as robust or clean as it should be, but demonstrates the path you might want to take. The rate limit is a guess, and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep
import requests
url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20 # seconds to wait after rate limited
with open('codes') as f:
codes = [l.strip('\n') for l in f]
def get_page(code, page, **kwargs):
params = {
# 'api_token': token,
'jurisdiction_code': code,
'page': page,
}
params.update(kwargs)
while True:
r = requests.get(url, params=params)
try:
data = r.json()
except ValueError:
return None
if 'error' in data:
print data['error']['message']
sleep(rate)
continue
return data['results']
def dump_page(code, page, data):
with open(os.path.join(code, str(page) + '.json'), 'w') as f:
json.dump(data, f)
for code in codes:
try:
os.makedirs(code)
except os.error:
pass
data = get_page(code, 1)
if data is None:
continue
dump_page(code, 1, data['companies'])
for page in xrange(1, int(data.get('total_pages', 1))):
data = get_page(code, page)
if data is None:
break
dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
json_content = json.loads(content)
if json_content['results']['total_count'] > 0:
fo = str(i) + '.json'
OUTFILE = os.path.join(directory, fo)
with open(OUTFILE, 'w') as f:
f.write(content)
except KeyError:
break
except ValueError:
break
etc. You might want to report the ValueError or the KeyError, but that's up to you.