I wanted to scrape a few PDFs from a great history crash course I used to read a long time ago. Sadly, the old website is down and I only managed to recover the old HTML code from archive.org
(the links I extracted work fine, e.g. https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing).
This script ends up downloading HTML files that say:
"We're sorry but your computer or network may be sending automated queries. To protect our users, we can't process your request right now."
Is there a way to get around this? I already put a few random delays into the code, so either that is insufficient or I am on Google's blacklist for now.
(The text.txt file can be found here: https://filebin.net/k2qw09embamx05ey)
import requests
import time
import random


def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    time.sleep(random.randrange(1, 2))
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)


def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None


def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


f = open('text.txt')
long_string = f.readlines()

interesting_strings = []
for item in long_string:
    if 'drive.google' in item:
        interesting_strings.append(item)
print(interesting_strings)

interesting_strings = interesting_strings[0]
interesting_strings = interesting_strings.split('https://web.archive.org/web/20161219093036/')

links = []
for item in interesting_strings:
    if 'drive.google' in item:
        idx = item.find('"')
        links.append(item[:idx])

cntr = 1
for link in links:
    print(link)
    fname = './data/History_' + str(cntr)
    file_id = link.split('/')[-2]
    print('id:', file_id)
    destination = fname
    download_file_from_google_drive(file_id, destination)
    print('Getting file #', str(cntr))
    cntr += 1
    time.sleep(random.randrange(3, 15) + random.random())
Use gdown:
import gdown
file_id = '0BzRJiIvdbSoKcHpGUWJBUDZ2WDA'
filename = 'file.pdf'
url = 'https://drive.google.com/uc?id=' + file_id
gdown.download(url, filename, quiet=False)
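If you still want to fetch every file your script finds, a minimal sketch (assuming the links list and ID extraction from your original parsing code) could feed each ID to gdown instead of Requests:

import gdown

# Sketch: reuse the ids extracted from text.txt and let gdown handle the
# confirmation step that was returning the "automated queries" page.
for cntr, link in enumerate(links, start=1):
    file_id = link.split('/')[-2]
    url = 'https://drive.google.com/uc?id=' + file_id
    gdown.download(url, './data/History_' + str(cntr) + '.pdf', quiet=False)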
Related
I have this code that can pull images from Pexels, but I don't know how to change it to download videos instead. I haven't seen anyone do this before, and any help would be greatly appreciated. I tried switching all the photo tags to video ones, but that didn't seem to work, and adding more libraries didn't help either.
import argparse
import json
import os
import time

import requests
import tqdm
from pexels_api import API

PEXELS_API_KEY = os.environ['PEXELS_KEY']
MAX_IMAGES_PER_QUERY = 100
RESULTS_PER_PAGE = 10
PAGE_LIMIT = MAX_IMAGES_PER_QUERY / RESULTS_PER_PAGE


def get_sleep(t):
    def sleep():
        time.sleep(t)
    return sleep


def main(args):
    sleep = get_sleep(args.sleep)
    api = API(PEXELS_API_KEY)
    query = args.query
    page = 1
    counter = 0
    photos_dict = {}

    # Step 1: Getting urls and meta information
    while page <= PAGE_LIMIT:
        api.search(query, page=page, results_per_page=RESULTS_PER_PAGE)
        photos = api.get_entries()
        for photo in tqdm.tqdm(photos):
            photos_dict[photo.id] = vars(photo)['_Photo__photo']
            counter += 1
        if not api.has_next_page:
            break
        page += 1
        sleep()
    print(f"Finishing at page: {page}")
    print(f"Images were processed: {counter}")

    # Step 2: Downloading
    if photos_dict:
        os.makedirs(args.path, exist_ok=True)
        # Saving dict
        with open(os.path.join(args.path, f'{query}.json'), 'w') as fout:
            json.dump(photos_dict, fout)
        for val in tqdm.tqdm(photos_dict.values()):
            url = val['src'][args.resolution]
            fname = os.path.basename(val['src']['original'])
            image_path = os.path.join(args.path, fname)
            if not os.path.isfile(image_path):  # ignore if already downloaded
                response = requests.get(url, stream=True)
                with open(image_path, 'wb') as outfile:
                    outfile.write(response.content)
            else:
                print(f"File exists: {image_path}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True)
    parser.add_argument('--path', type=str, default='./results_pexels')
    parser.add_argument('--resolution', choices=['original', 'large2x', 'large',
                                                 'medium', 'small', 'portrait',
                                                 'landscape', 'tiny'], default='original')
    parser.add_argument('--sleep', type=float, default=0.1)
    args = parser.parse_args()

    main(args)
Sorry for bumping the question. I just faced a similar situation when downloading videos from Pexels using the Python API pexelsPy, so this may be helpful:
I retrieved the ID of each video and then built the download URL, which has the following structure: "https://www.pexels.com/video/" + ID + "/download".
See the following example:
def download_video(type_of_videos):
    video_tag = random.choice(type_of_videos)
    PEXELS_API = '-'  # please add your API key here
    api = API(PEXELS_API)
    retrieved_videos = read_already_download_files('downloaded_files.txt')
    video_found_flag = True
    num_page = 1
    while video_found_flag:
        api.search_videos(video_tag, page=num_page, results_per_page=10)
        videos = api.get_videos()
        for data in videos:
            if data.width > data.height:  # look for horizontal orientation videos
                if data.url not in retrieved_videos:
                    # write_file('downloaded_files.txt', data.url)
                    url_video = 'https://www.pexels.com/video/' + str(data.id) + '/download'  # create the url with the video id
                    r = requests.get(url_video)
                    with open(data.url.split('/')[-2] + '.mp4', 'wb') as outfile:
                        outfile.write(r.content)
                    return data.url.split('/')[-2] + '.mp4'  # download the video
        num_page += 1
The download_video function takes a list of tag strings, e.g. ['happy', 'sad', 'relax'], and randomly chooses one of them.
PEXELS_API should contain your API key.
read_already_download_files('downloaded_files.txt') retrieves the URLs of files that have already been downloaded, so the function can check whether the currently found file was downloaded before.
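A quick usage sketch, assuming the pexelsPy imports and the read_already_download_files helper from the snippet above are in place:

# Hypothetical call: picks one of the tags at random and returns the saved filename.
tags = ['happy', 'sad', 'relax']
saved_file = download_video(tags)
print('Downloaded:', saved_file)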
from pypexels import PyPexels
import requests

api_key = 'api id'

# instantiate PyPexels object
py_pexel = PyPexels(api_key=api_key)

search_videos_page = py_pexel.videos_search(query="love", per_page=40)
# while True:
for video in search_videos_page.entries:
    print(video.id, video.user.get('name'), video.url)
    data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(data_url)
    print(r.headers.get('content-type'))
    with open('sample.mp4', 'wb') as outfile:
        outfile.write(r.content)
    # if not search_videos_page.has_next:
    break
    # search_videos_page = search_videos_page.get_next_page()
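The commented-out lines hint at how to fetch more than the first result. A rough sketch of the full pagination loop, assuming pypexels exposes has_next and get_next_page on the search result as those comments suggest:

# Sketch: iterate over every result page and save each video under its own id.
search_videos_page = py_pexel.videos_search(query="love", per_page=40)
while True:
    for video in search_videos_page.entries:
        data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
        r = requests.get(data_url)
        with open(str(video.id) + '.mp4', 'wb') as outfile:
            outfile.write(r.content)
    if not search_videos_page.has_next:
        break
    search_videos_page = search_videos_page.get_next_page()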
I just tried to do the same. When I was looking for it, I wanted a simple example; all the other fancy stuff I was sure I could add myself, so I built upon inou's answer. The example shown is very basic: it requests one page with only 5 results using the 'Tiger' tag in the search query. It downloads the first video using the ID provided by the response and simply writes it to the source folder. The API is provided by pexelsPy and the request is executed using the standard requests package. To get access to the API, you need to create a key on the Pexels website (see here). Once you have your own API key, you should be able to simply substitute the example key shown and run the code as a test.
import pexelsPy
import requests
PEXELS_API = '16gv62567257256iu78krtuzwqsddudrtjberzabzwzjsrtgswnr'
api = pexelsPy.API(PEXELS_API)
api.search_videos('Tiger', page=1, results_per_page=5)
videos = api.get_videos()
url_video = 'https://www.pexels.com/video/' + str(videos[0].id) + '/download'
r = requests.get(url_video)
with open('test.mp4', 'wb') as outfile:
    outfile.write(r.content)
You can download multiple videos with this code:
import pexelsPy
import requests
PEXELS_API = '-'
api = pexelsPy.API(PEXELS_API)
api.search_videos('nature', page=2, results_per_page=100, orientation='landscape')
videos = api.get_videos()
for i, video in enumerate(videos):
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(url_video)
    with open(f'test_{i}.mp4', 'wb') as outfile:
        outfile.write(r.content)
This will download 100 videos, with each video being written to a separate file named test_0.mp4, test_1.mp4, ..., test_99.mp4.
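Since full-resolution videos can be large, r.content buffers each file entirely in memory. A hedged variation of the same loop that streams each response to disk in chunks instead:

# Sketch: stream each video to disk instead of holding it in memory.
for i, video in enumerate(videos):
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    with requests.get(url_video, stream=True) as r:
        with open(f'test_{i}.mp4', 'wb') as outfile:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    outfile.write(chunk)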
This question has been asked earlier in the following link:
How to write dynamodb scan data's in CSV and upload to s3 bucket using python?
I have amended the code as advised in the comments. It now looks as follows:
import csv
import boto3
import json

dynamodb = boto3.resource('dynamodb')
db = dynamodb.Table('employee_details')


def lambda_handler(event, context):
    AWS_BUCKET_NAME = 'session5cloudfront'
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(AWS_BUCKET_NAME)
    path = '/tmp/' + 'employees.csv'
    try:
        response = db.scan()
        myFile = open(path, 'w')
        for i in response['Items']:
            csv.register_dialect('myDialect', delimiter=' ', quoting=csv.QUOTE_NONE)
            with myFile:
                writer = csv.writer(myFile, dialect='myDialect')
                writer.writerows(i)
            print(i)
    except:
        print("error")

    bucket.put_object(
        ACL='public-read',
        ContentType='application/csv',
        Key=path,
        # Body=json.dumps(i),
    )
    # print("here")
    body = {
        "uploaded": "true",
        "bucket": AWS_BUCKET_NAME,
        "path": path,
    }
    # print("then here")
    return {
        "statusCode": 200,
        "body": json.dumps(body)
    }
I am a novice; please help me fix this code, as it has a problem inserting the data into the file created in the S3 bucket.
Thanks
I have revised the code to be simpler and to also handle paginated responses for tables with more than 1MB of data:
import csv
import boto3
import json

TABLE_NAME = 'employee_details'
OUTPUT_BUCKET = 'my-bucket'
TEMP_FILENAME = '/tmp/employees.csv'
OUTPUT_KEY = 'employees.csv'

s3_resource = boto3.resource('s3')
dynamodb_resource = boto3.resource('dynamodb')
table = dynamodb_resource.Table(TABLE_NAME)


def lambda_handler(event, context):
    with open(TEMP_FILENAME, 'w') as output_file:
        writer = csv.writer(output_file)
        header = True
        first_page = True

        # Paginate results
        while True:

            # Scan DynamoDB table
            if first_page:
                response = table.scan()
                first_page = False
            else:
                response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])

            for item in response['Items']:

                # Write header row?
                if header:
                    writer.writerow(item.keys())
                    header = False

                writer.writerow(item.values())

            # Last page?
            if 'LastEvaluatedKey' not in response:
                break

    # Upload temp file to S3
    s3_resource.Bucket(OUTPUT_BUCKET).upload_file(TEMP_FILENAME, OUTPUT_KEY)
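One caveat: DynamoDB items are schemaless, so later items can carry attributes the first item did not have, and the header built from the first item will no longer line up. A hedged sketch of one way around this (it assumes you can afford to hold the scanned items in memory, or make a first pass to collect attribute names) uses csv.DictWriter:

# Sketch: build the union of all attribute names first, then write rows with
# csv.DictWriter so missing attributes become empty cells instead of shifting columns.
import csv

def write_items_to_csv(items, path):
    fieldnames = sorted({key for item in items for key in item})
    with open(path, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames, restval='')
        writer.writeheader()
        for item in items:
            writer.writerow(item)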
I'm trying to update an existing image of a product in PrestaShop. I'm using Python and Requests and the following code:
import requests
import io
import mimetypes
from PIL import Image
from StringIO import StringIO

api_key = 'test'
url = "https://.../api/images/products/249/445"
file_name = 't3_6kxvzv.jpg'

fd = io.open(file_name, "rb")
content = fd.read()
fd.close()


def encode_multipart_formdata():
    """Encode files to an http multipart/form-data.

    :param files: a sequence of (type, filename, value)
        elements for data to be uploaded as files.
    :return: headers and body.
    """
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'
    L = []
    L.append('--' + BOUNDARY)
    L.append(
        'Content-Disposition: form-data; \
name="%s"; filename="%s"' % ("image", file_name))
    L.append('Content-Type: %s' % get_content_type(file_name))
    L.append('')
    L.append(content)
    L.append('--' + BOUNDARY + '--')
    L.append('')
    body = CRLF.join(L)
    headers = {
        'Content-Type': 'multipart/form-data; boundary=%s' % BOUNDARY
    }
    return headers, body


def get_content_type(file_name):
    """Retrieve filename mimetype.

    :param filename: file name.
    :return: mimetype.
    """
    return mimetypes.guess_type(file_name)[0] or 'application/octet-stream'


header, body = encode_multipart_formdata()

r = requests.put(url, data=body, auth=(api_key, ""), headers=header)
# also tried data = content

r = requests.get(url, auth=(api_key, ""))
i = Image.open(StringIO(r.content))
i.show()
I tried various PUT and POST requests with
data = content
but I only get a 400 status code.
I then tried to GET the existing image, which works fine.
The api_key has all the necessary settings to allow PUT and POST.
I then tried to read into how prestapyt solves this problem; however, after importing prestapyt I couldn't follow its documentation to add an image to a product using:
prestashop.add("https://...api/images/products/249/445", files[('image',file_name,content)])
which produces:
KeyError: ('image', 't3_6kxvzv.jpg', '\xff\xd8\xff\xe0\x00\x10JFI...
I then tried to modify the encode_multipart_formdata and get_content_type functions to produce a similar solution, but I cannot get past the 400 status code.
I would very much prefer to use Requests and understand how the image update works, rather than relying on prestapyt as a turn-key solution.
Thank you for your time!
Documentation I used:
Prestashop http://doc.prestashop.com/display/PS16/Chapter+9+-+Image+management
prestapyt https://github.com/prestapyt/prestapyt
UPDATE:
I was able to use Requests and POST to add an image to a product via:
url_2 = "https:/.../api/images/products/249"
r = requests.post(url_2, data=body, auth=(api_key,""), headers=header)
Still not able to use PUT to change or update an image.
This answer comes a little bit late, but here it is. You're reinventing the wheel, although it has been interesting to see how: now I understand how to build multipart form data from scratch. I tried your code and it fails because you're joining str and bytes in your encode_multipart_formdata function:
L.append(content)
That line will raise a TypeError exception.
Requests can post multipart form data in a very simple way:
files = {'image': ('../imagepath/imagename.jpg', open('../imagepath/imagename.jpg', 'rb'), 'image/jpg')}
body, content_type = requests.models.RequestEncodingMixin._encode_files(files, {})
headers = {
    "Content-Type": content_type
}
r = requests.post(url + "images/products/" + str(product_id), data=body, headers=headers)
print(r.text)
This has been tested with Python 3.7 and PrestaShop 1.7.8.2.
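Note that _encode_files is a private Requests helper and could change between versions. A hedged alternative, assuming the same url base, product_id, image field, and the API-key auth used elsewhere in this thread, is to let Requests build the multipart body itself through the public files= argument:

# Sketch: let Requests generate the multipart body and boundary automatically.
import requests

with open('../imagepath/imagename.jpg', 'rb') as image_file:
    files = {'image': ('imagename.jpg', image_file, 'image/jpeg')}
    r = requests.post(url + "images/products/" + str(product_id),
                      files=files, auth=(api_key, ""))
print(r.status_code, r.text)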
I couldn't get PUT to work, so I used DELETE and POST instead.
import requests
import io
import mimetypes
import xml.etree.ElementTree as ET
import sys

api = ''
urls = ["https://.../api/images/products/22",
        "https://.../api/images/products/31",
        "https://.../api/images/products/37",
        "https://.../api/images/products/46",
        "https://.../api/images/products/212"]


def encode_multipart_formdata(file_name, content):
    """Encode files to an http multipart/form-data.

    :param files: a sequence of (type, filename, value)
        elements for data to be uploaded as files.
    :return: headers and body.
    """
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'
    L = []
    L.append('--' + BOUNDARY)
    L.append(
        'Content-Disposition: form-data; \
name="%s"; filename="%s"' % ("image", file_name))
    L.append('Content-Type: %s' % get_content_type(file_name))
    L.append('')
    L.append(content)
    L.append('--' + BOUNDARY + '--')
    L.append('')
    body = CRLF.join(L)
    headers = {
        'Content-Type': 'multipart/form-data; boundary=%s' % BOUNDARY
    }
    return headers, body


def get_content_type(file_name):
    """Retrieve filename mimetype.

    :param filename: file name.
    :return: mimetype.
    """
    return mimetypes.guess_type(file_name)[0] or 'application/octet-stream'


def get_image_url(url):
    """get from a given url the image url"""
    r = requests.get(url, auth=(api, ""))
    tree = ET.fromstring(r.content)
    return tree.find("image").find("declination").get("{http://www.w3.org/1999/xlink}href")


def delete_image(url):
    """deletes the image on prestashop given by url"""
    url2 = get_image_url(url)
    requests.delete(url2, auth=(api, ""))


def load_image(file_name):
    """loads image to upload"""
    fd = io.open(file_name, "rb")
    content = fd.read()
    fd.close()
    return content, file_name


def upload_image(url, file_name):
    """uploads new image to a given url"""
    content, file_name = load_image(file_name)
    header, body = encode_multipart_formdata(file_name, content)
    requests.post(url, data=body, auth=(api, ""), headers=header)


if __name__ == "__main__":
    file_name = sys.argv[1]
    content, file_name = load_image(file_name)
    for i in urls:
        delete_image(i)
        upload_image(i, file_name)
The workaround works fine; I still don't understand why it has to be this complicated and why PUT doesn't work.
I'm currently using the session.get() code found in this Stack Overflow answer. When I save the Drive files they don't have a file-type suffix, so I have to add one manually, based on the file type, before I can open them. Is there a way to parse the chunk and get a file-type variable, or maybe even detect it by its hex signature? Is there a better method?
import requests


def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)


def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value


def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


if __name__ == "__main__":
    file_id = 'TAKE ID FROM SHAREABLE LINK'
    destination = 'DESTINATION FILE ON YOUR DISK'
    download_file_from_google_drive(file_id, destination)
Source: Python: download files from google drive using url
via #user6770522
I used BeautifulSoup and findAll to grab the 'title' tag, then applied it to the destination with os.path.join.
for filename in soup.findAll('title'):
    link = filename.string
    filename = link.replace(' - Google Drive', '')
    destination = os.path.join(dir, filename)
    file_path = os.path.join(year_name, destination)
    drive_pull.download_file_from_google_drive(url_id, file_path)
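Regarding detecting the file type from the download itself: the response headers are often enough. A hedged sketch (the Content-Type is not guaranteed to be set usefully for every Drive file) that turns it into an extension you could append to destination before calling save_response_content:

# Sketch: derive a file extension from the response's Content-Type header.
import mimetypes

def guess_extension_from_response(response):
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
    # e.g. 'application/pdf' -> '.pdf'; fall back to '.bin' if unknown
    return mimetypes.guess_extension(content_type) or '.bin'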
I am trying to access pages by incrementing the page counter using the OpenCorporates API. The problem is that sometimes the response contains useless data. For example, in the URL below for jurisdiction_code = ae_az I get a page showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How do I check for such data and skip over it to move on to the next jurisdiction?
This is my code:
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}?api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
    break
...
Here's your code written with Requests and using the answer here. It is nowhere near as robust or clean as it should be, but demonstrates the path you might want to take. The rate limit is a guess, and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep

import requests

url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20  # seconds to wait after rate limited

with open('codes') as f:
    codes = [l.strip('\n') for l in f]


def get_page(code, page, **kwargs):
    params = {
        # 'api_token': token,
        'jurisdiction_code': code,
        'page': page,
    }
    params.update(kwargs)
    while True:
        r = requests.get(url, params=params)
        try:
            data = r.json()
        except ValueError:
            return None
        if 'error' in data:
            print data['error']['message']
            sleep(rate)
            continue
        return data['results']


def dump_page(code, page, data):
    with open(os.path.join(code, str(page) + '.json'), 'w') as f:
        json.dump(data, f)


for code in codes:
    try:
        os.makedirs(code)
    except os.error:
        pass
    data = get_page(code, 1)
    if data is None:
        continue
    dump_page(code, 1, data['companies'])
    for page in xrange(1, int(data.get('total_pages', 1))):
        data = get_page(code, page)
        if data is None:
            break
        dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
    json_content = json.loads(content)
    if json_content['results']['total_count'] > 0:
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
except KeyError:
    break
except ValueError:
    break
etc. You might want to report the ValueError or the KeyError, but that's up to you.