import requests
s = ['jobs','careers','opportunities']
u = ['yahoo.com','statestreet.com']
f = []
for i in u:
for j in s:
w = "http://{}/{}".format(i,j)
print w
r = requests.get(w)
print r.status_code
if(r.status_code == 200):
f.append(w)
break
print f
This code is working for most websites but not for websites like www.surveymonkey.com.
You don't state what isn't working, or what you expect the code to do. However, the indentation in the final if statement in your code seems incorrect.
Correcting that (and editing the variable names to be more descriptive as follows) at least runs as expected for me:
import requests
paths = ['jobs', 'careers', 'opportunities']
domains = ['yahoo.com', 'statestreet.com', 'surveymonkey.com']
results = []
for domain in domains:
for path in paths:
url = "http://{}/{}".format(domain, path)
print url
r = requests.get(url)
print r.status_code
if r.status_code == requests.codes.OK:
results.append(url)
break
print results
Related
I would like to use a list of URLs to process their data using python. I can obtain the information using one URL but I would like to from many.
The following code gets me the data that I need although I am unable to use a list for example where #myString I am on day few learning python.
import requests
#myString = ['https://{redacted by me}/vessels/amadi_9682552_10003796/,https://{redacted by me}/vessels/akebono-maru_9554729_2866687/,https://{redacted by me}/vessels/amani_9661869_9276632/,https://{redacted by me}/vessels/aman-sendai_9134323_2017277/,https://{redacted by me}/vessels/al-aamriya_9338266_25273/}
r = requests.get('https://{redacted by me}/vessels/amadi_9682552_10003796/')
if r.status_code == 200:
print(r.status_code)
elif r.status_code == 300:
print(r.status_code) #post url seperately to a defined api -- future
elif r.status_code == 404:
print(r.status_code) #post url seperately to a defined api -- future for removal
data = r.content
data = data.decode("utf-8")
#print(data)
next_port_locode = data.split('locode: "')[1].split('"')[0].strip()
next_port_iso2 = data.split('iso2: "')[1].split('"')[0].strip()
next_port_name = data.split('iso2: "')[1].split('name: "')[1].split('"')[0].strip()
next_port_eta = data.split('eta: moment("')[1].split('"')[0].strip()
next_port_latitude = float(data.split('latitude: ')[1].split(',')[0].strip())
next_port_longitude = float(data.split('longitude: ')[1].split('\n')[0].strip())
datajson = {
"next_port_locode": next_port_locode,
"next_port_iso2": next_port_iso2,
"next_port_name": next_port_name,
"next_port_eta": next_port_eta,
"next_port_latitude": next_port_latitude,
"next_port_longitude": next_port_longitude,
}
print(datajson)
requests.post("https://{redacted by me}/api/Moments", json=datajson)
Just need to work on your Python basics, this should do it. Check out using lists and for loops. Good luck!
import requests
myString = ['https://{redacted by me}/vessels/amadi_9682552_10003796/','https://{redacted by me}/vessels/akebono-maru_9554729_2866687/','https://{redacted by me}/vessels/amani_9661869_9276632/','https://{redacted by me}/vessels/aman-sendai_9134323_2017277/','https://{redacted by me}/vessels/al-aamriya_9338266_25273/'}
for myURL in myString:
r = requests.get(myURL)
if r.status_code == 200:
.
.
.
I'm scraping from the World Bank for a paper and I'm trying to make a loop of the web scraping of different indicators but I can't seem to make it work until a certain part of the code. Hope someone can help please?
#Single Code for each indicator
indcator = 'SP.POP.TOTL?date=2000:2020'
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % indicator
response = requests.get(url)
print(response)
result = response.content
result = json.loads(result)
pop_total_df = pd.DataFrame.from_dict(result[1])
This is the loop i'm trying to build but I got an error in the last part of below code:
#indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020','SP.POP.TOTL?date=2000:2020'}
#list of urls with the indicators
url_list = []
for i in indicator:
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
url_list.append(url)
result_list = []
for i in url_list:
response = requests.get(i)
print(response)
result_list.append(response.content)
#Erroneous code
result_json = []
for i in range(3):
result_json.append(json.loads(result_list[i])))
As you are making 2 requests (FP.CPI.TOTL.ZG?date=2000:2020 and SP.POP.TOTL?date=2000:2020) your result_list length is 2, so its index are 0 and 1. Use range(2) or range(len(result_list)) instead:
import requests, json
#indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020','SP.POP.TOTL?date=2000:2020'}
#list of urls with the indicators
url_list = []
for i in indicator:
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
url_list.append(url)
result_list = []
for i in url_list:
response = requests.get(i)
print(response)
result_list.append(response.content)
#Erroneous code
result_json = []
for i in range(len(result_list)):
result_json.append(json.loads(result_list[i]))
This is the code I used to take all the pics from r/pics on reddit and put it into a directory. I want to be able to take the actual files in the directory and put it into a list. Stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('direct'):
os.makedirs('direct')
os.chdir('direct')
x = 0
for image in image_tags:
try:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
f.close()
x+=1
except:
pass
Edit: Here is updated code but still dealing with problem
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('directory'):
os.makedirs('directory')
os.chdir('directory')
x = 0
mylist = []
for image in image_tags:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
mylist.append(img_path)
f.close()
x += 1
print(mylist)
create a list in the beginning of your code:
...
mylist = []
...
then after you get each image, add it to the list
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
EDIT:
I executed your updated code and the image_tags is returning empty - in fact the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
Doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the reddit api so that reddit doesn't limit your requests.
I am new to python and just wanted to know if this is possible: I have scraped a url using urllib and want to edit different pages.
Example:
http://test.com/All/0.html
I want the 0.html to become 50.html and then 100.html and so on ...
found_url = 'http://test.com/All/0.html'
base_url = 'http://test.com/All/'
for page_number in range(0,1050,50):
url_to_fetch = "{0}{1}.html".format(base_url,page_number)
That should give you URLs from 0.html to 1000.html
If you want to use urlparse(as suggested in comments to your question):
import urlparse
found_url = 'http://test.com/All/0.html'
parsed_url = urlparse.urlparse(found_url)
path_parts = parsed_url.path.split("/")
for page_number in range(0,1050,50):
new_path = "{0}/{1}.html".format("/".join(path_parts[:-1]), page_number)
parsed_url = parsed_url._replace(path= new_path)
print parsed_url.geturl()
Executing this script would give you the following:
http://test.com/All/0.html
http://test.com/All/50.html
http://test.com/All/100.html
http://test.com/All/150.html
http://test.com/All/200.html
http://test.com/All/250.html
http://test.com/All/300.html
http://test.com/All/350.html
http://test.com/All/400.html
http://test.com/All/450.html
http://test.com/All/500.html
http://test.com/All/550.html
http://test.com/All/600.html
http://test.com/All/650.html
http://test.com/All/700.html
http://test.com/All/750.html
http://test.com/All/800.html
http://test.com/All/850.html
http://test.com/All/900.html
http://test.com/All/950.html
http://test.com/All/1000.html
Instead of printing in the for loop you can use the value of parsed_url.geturl() as per your need. As mentioned, if you want to fetch the content of the page you can use python requests module in the following manner:
import requests
found_url = 'http://test.com/All/0.html'
parsed_url = urlparse.urlparse(found_url)
path_parts = parsed_url.path.split("/")
for page_number in range(0,1050,50):
new_path = "{0}/{1}.html".format("/".join(path_parts[:-1]), page_number)
parsed_url = parsed_url._replace(path= new_path)
# print parsed_url.geturl()
url = parsed_url.geturl()
try:
r = requests.get(url)
if r.status_code == 200:
with open(str(page_number)+'.html', 'w') as f:
f.write(r.content)
except Exception as e:
print "Error scraping - " + url
print e
This fetches the content from http://test.com/All/0.html till http://test.com/All/1000.html and saves the content of each URL into its own file. The file name on disk would be the file name in URL - 0.html to 1000.html
Depending on the performance of the site you are trying to scrape from you might experience considerable time delays in running the script. If performance is of importance, you can consider using grequests
I am screen scraping data using a web crawler and storing the results - (tweets from a twitter page) as separate html files for each user I'm crawling. I intend to later parse the html files and store the data into a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter
start_follower = "NYTimesKrugman"
depth = 3
searched = set()
api = twitter.Api()
def crawl(follower, in_depth):
if in_depth > 0:
searched.add(follower)
directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
output = open(directory, 'a')
output.write(follower)
output.write('\n\n')
users = api.GetFriends(follower)
names = set([str(u.screen_name) for u in users])
names -= searched
for name in list(names)[0:5]:
crawl(name, in_depth-1)
crawl(start_follower, depth)
for x in searched:
print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time
start_follower = "NYTimeskrugman"
depth = 2
searched = set()
api = twitter.Api()
def add_to_U(user):
U.append(user)
def site(follower): #creates a twitter site url in string format based on the follower username
followersite = "http://mobile.twitter.com/" + follower
return followersite
def getPage(follower): #obtains access to a webapge
url = site(follower)
response = urllib.urlopen(url)
return response
def getSoup(response): #creates the parsing module
html = response.read()
soup = BeautifulSoup(html)
return soup
def gettweets(soup, output):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
output.write(b)
output.write('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more') != -1:
return True
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def crawl(follower, in_depth): #main method of sorts
if in_depth > 0:
searched.add(follower)
directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
output = open(directory, 'a')
output.write(follower)
output.write('\n\n')
a = getPage(follower)
soup = getSoup(a)
gettweets(soup, output)
tweets = are_more_tweets(soup)
while(tweets):
b = getnewlink(soup)
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup, output)
tweets = are_more_tweets(soup)
users = api.GetFriends(follower)
names = set([str(u.screen_name) for u in users])
names -= searched
for name in list(names)[0:5]:
print name
crawl(name, in_depth - 1)
crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (you're only going to get one level of recursion in the full code). Also, you only grab the first five names from the followers list: for name in list(names)[0:5]: So you get six people total: the starting follower and their first five friends.