I am building a class to download files asynchronously. However, I am facing a weird bug.
import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
import os
import pathlib
class AsyncDownloader:
"""Download files asynchronously"""
__urls = set()
__dest_path = None
__user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
__read_timeout = 60
__connection_timeout = 30
def setSourceCSV(self, source_path, column_name):
self.source_path = source_path
self.column_name = column_name
try:
my_csv = pd.read_csv(source_path, usecols=[self.column_name], chunksize=10)
except ValueError:
print("The column name doesn't exist")
return
else:
# No exception whatsoever
for chunk in my_csv:
AsyncDownloader.__urls.update(set(getattr(chunk, self.column_name)))
def setDestinationPath(self, dest_path):
if dest_path.endswith('/'):
dest_path = dest_path[:-1]
self.dest_path = dest_path
# Make directory if not exist
# TODO Add exception in case we can't create the directory
pathlib.Path(self.dest_path).mkdir(parents=True, exist_ok=True)
if os.access(self.dest_path, os.W_OK):
AsyncDownloader.__dest_path = pathlib.Path(self.dest_path).resolve()
def setUserAgent(self, useragent):
self.useragent = useragent
AsyncDownloader.__user_agent = self.useragent
def setConnectionTimeout(self, ctimeout_secs):
self.timeout_secs = ctimeout_secs
AsyncDownloader.__connection_timeout = self.timeout_secs
def setReadTimeout(self, rtimeout_secs):
self.timeout_secs = rtimeout_secs
AsyncDownloader.__read_timeout = self.timeout_secs
def download(self):
try:
session = FuturesSession(max_workers=10)
session.headers.update({'user-agent': AsyncDownloader.__user_agent})
session.request(AsyncDownloader.__connection_timeout,
AsyncDownloader.__connection_timeout)
results = []
for url in AsyncDownloader.__urls:
results.append(session.get(url))
for result in results:
response = result.result()
filename = os.path.basename(response.url)
if AsyncDownloader.__dest_path is None:
AsyncDownloader.__dest_path = pathlib.Path(filename)
else:
AsyncDownloader.__dest_path = pathlib.Path(str(AsyncDownloader.__dest_path) + os.path.sep + filename).resolve()
# save file in directory
print(AsyncDownloader.__dest_path) # Shows correct path
with open(AsyncDownloader.__dest_path, 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
except requests.exceptions.HTTPError as errh:
print("Http Error:", errh)
except requests.exceptions.ConnectionError as errc:
print("Error Connecting:", errc)
except requests.exceptions.Timeout as errt:
print("Timeout Error:", errt)
except requests.exceptions.RequestException as err:
print("OOps: Something Else", err)
else:
return
def printURLs(self):
print(AsyncDownloader.__urls)
The print shows the correct path, which is
C:\Users\XYZ\PycharmProjects\AsyncDownloaderTest\images\Spring-Landscape-HD-Wallpapers-25912.jpg
However, open sees the wrong path:
with open(AsyncDownloader.__dest_path, 'wb') as fd:
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\XYZ\\PycharmProjects\\AsyncDownloaderTest\\images\\Spring-Landscape-HD-Wallpapers-25912.jpg\\FUE7XiFApEqWZQ85wYcAfM.jpg'
I think the indentation is OK, so I wonder what's wrong.
Change:
AsyncDownloader.__dest_path = pathlib.Path(str(AsyncDownloader.__dest_path)
+ os.path.sep + filename).resolve()
to:
AsyncDownloader.__dest_path = pathlib.Path(
os.path.split(str(AsyncDownloader.__dest_path))[0] + os.path.sep + filename).resolve()
This appends the new file name to the directory instead of to the full path of the previously downloaded file.
Change the following line
AsyncDownloader.__dest_path = pathlib.Path(str(AsyncDownloader.__dest_path)
+ os.path.sep + filename).resolve()
to:
AsyncDownloader.__dest_path = pathlib.Path(os.path.join(os.path.dirname(AsyncDownloader.__dest_path), filename)).resolve()
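Both suggestions address the same root cause: download() keeps mutating the shared class attribute, so after the first file AsyncDownloader.__dest_path already ends in a file name and the next name gets appended to that. A minimal sketch of the same idea as a standalone helper that keeps the per-file path local (save_response and dest_dir are illustrative names, not part of the original class):

import os
import pathlib

def save_response(response, dest_dir=None, chunk_size=128):
    # Per-response logic from download(), but the destination directory is never
    # modified; the target path is rebuilt locally for every file.
    filename = os.path.basename(response.url)
    if dest_dir is None:
        target = pathlib.Path(filename)
    else:
        target = pathlib.Path(dest_dir) / filename
    with open(target, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
    return target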
Hello, I am running a script that gets data from certain websites and puts it into a database using Scrapy. The script runs well, but the log text file shows "DEBUG: Starting new HTTPS connection (1): 1.rome.api.flipkart.com:443".
What could the real problem be here? Any help is appreciated. Here is the code:
import os
import sys, getopt
import time
import datetime
import pytz
import mysql.connector
import configparser
import shutil
import time
import concurrent.futures
import pandas as pd
currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)
from datetime import datetime
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process
from db.db_action import DBAction
from utils import utils
from concurrent.futures import ThreadPoolExecutor, as_completed
tz = pytz.timezone("Asia/Kolkata")
crawl_inputs = dict()
crawl_inputs["env"] = "prod"
crawl_inputs["marketplace"] = "Amazon"
crawl_inputs["site"] = "amz_kw"
crawl_inputs["db_name"] = "asian_paints"
crawl_inputs["pf_id"] = "1"
crawl_inputs["location_search"] = "0"
crawl_inputs["limit"] = ""
crawl_inputs["page"] = 1
crawl_inputs["kw_snapshot"] = 0
crawl_inputs["pdp_snapshot"] = 0
crawl_inputs["quick_search"] = 1
crawl_inputs["client_id"] = 1241
crawl_inputs["client"] = "asian_paints"
crawl_inputs["start_time"] = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
db_action = DBAction()
actual_count = 0
result_count = 0
archived_path = "C:/archives/"
connection = None
cursor = None
# Directory Create
try:
if not os.path.exists(archived_path):
os.makedirs(archived_path)
if not os.path.exists(archived_path + datetime.now(tz).strftime("%Y%m%d")+"/"+ crawl_inputs["site"]):
os.makedirs(archived_path + datetime.now(tz).strftime("%Y%m%d")+"/"+ crawl_inputs["site"])
if not os.path.exists("C:/var/logs/" + crawl_inputs["site"]):
os.makedirs("C:/var/logs/" + crawl_inputs["site"])
shutil.move("C:/var/logs/" + crawl_inputs["site"], archived_path + datetime.now(tz).strftime("%Y%m%d"), copy_function = shutil.copytree)
except Exception as e:
print(e)
print("File creation error: {0}:{1}".format(e.errno, e.strerror))
try:
if os.name == "nt":
log_path = "C:/var/logs"
base_dir = log_path+"/"+crawl_inputs["site"]
if not os.path.exists(base_dir):
os.makedirs(base_dir)
else:
log_path = "/var/logs/"
base_dir = log_path+"/"+crawl_inputs["site"]
if not os.path.exists(base_dir):
os.makedirs(base_dir)
directories = ["output", "run_log", "webpages"]
for directory in directories:
if not os.path.exists(base_dir+"/"+directory):
os.makedirs(base_dir+"/"+directory)
except OSError as oserr:
print("OS error occurred trying to open. Aborting.. Error{0}:{1}".format(oserr.errno, oserr.strerror))
sys.exit(1)
except IOError as ioerr:
print("I/O Error{0}: {1}".format(ioerr.errno, ioerr.strerror))
sys.exit(1)
except FileNotFoundError as fnfe:
print("File not found. Aborting.. Error: {0}:{1}".format(fnfe.errno, fnfe.strerror))
sys.exit(1)
except Exception as e:
print("File creation Error. Aborting.. Error: {0}:{1}".format(e.errno, e.strerror))
sys.exit(1)
crawl_inputs = db_action.get_kw_platform_inputs(crawl_inputs)
print(f"Total Executing Inputs : {len(crawl_inputs['inputs'])}")
print("Crawl ID: {0}".format(crawl_inputs["crawl_id"]))
def start(input):
pf_id = str(input["pf_id"])
keyword = str(input["keyword"])
brand_id = str(input["brand_id"])
brand_name = str(input["brand_name"])
keyword_id = str(input["keyword_id"])
location_id = str(input["location_id"])
location = str(input["location"])
pincode = str(input["pincode"])
location_search = str(crawl_inputs["location_search"])
env = str(crawl_inputs["env"])
db_name = str(crawl_inputs["db_name"])
crawl_id = str(crawl_inputs["crawl_id"])
site = str(crawl_inputs["site"])
page = str(crawl_inputs["page"])
command = 'python ' +currentdir+ '/main_kw.py --env="' +env+ '" --db_name="' +db_name+ '" --crawl_id="' +crawl_id+ '" --site="' +site+ '" --pf_id="'+pf_id+ '" --brand_id="' +brand_id+ '" --brand_name="' +brand_name+ '" --keyword_id="' +keyword_id+ '" --keyword="' +keyword+ '" --location_id="' +location_id+ '" --location="' +location+ '" --pincode="' +str(pincode)+ '" --page="' +page+ '" --location_search="' +location_search+ '"'
print("Executing Input :{0}".format(command))
os.system(command)
def runner():
threads = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
for input in crawl_inputs.get("inputs"):
print(f"Input: {input}")
task = executor.submit(start, input)
threads.append(task)
for task in concurrent.futures.as_completed(threads):
print(task.result())
runner()
time.sleep(5)
crawl_inputs["finish_time"] = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
connection = db_action.db_connection(db_name=crawl_inputs["db_name"], env=crawl_inputs["env"])
cursor = connection.cursor()
try:
cursor.execute("update `amazon_crawl_kw` set `status` = 0 where `crawl_id` != " +str(crawl_inputs["crawl_id"])+ ";")
cursor.execute("select count(kw_crawl_data_id) as product_count from `amazon_crawl_kw` where status = 1 and pf_id = "+str(crawl_inputs["pf_id"])+ " and crawl_id=" +str(crawl_inputs["crawl_id"])+ ";")
row = cursor.fetchone()
print("row value: "+ str(row))
result_count = row["product_count"]
print("Crawled row count : " + str(result_count))
try:
sql2 = 'UPDATE rb_crawl SET status=1, end_time = "' +str(crawl_inputs["finish_time"])+ '" ,no_of_sku_parsed='+ str(result_count)+ ' WHERE crawl_id=' + str(crawl_inputs["crawl_id"])
cursor.execute(sql2)
except Exception as e:
print("The following exception occured while updating : "+ str(e))
sql3 = 'UPDATE rb_platform SET kw_crawl_data_date = "'+ str(crawl_inputs["start_time"]) + '", kw_crawl_data_id = ' +str(crawl_inputs["crawl_id"])+ ' WHERE pf_id = ' + str(crawl_inputs["pf_id"])
cursor.execute(sql3)
connection.commit()
connection.close()
print("Updated rb_platform successfully")
except Exception as e:
print("Updating crawl id failed with exception as :" + str(e))
# try:
# items_count = result_count
# subject = crawl_inputs["site"] +" crawling completed"
# body = "Hi Team,<br><br>" +crawl_inputs["site"]+ " crawling successfully completed for the plateform " +crawl_inputs["marketplace"]+ "...<br>Platform Id: " +str(crawl_inputs["pf_id"])+ "<br>Crawl Id: " +str(crawl_inputs["crawl_id"])+ "<br>Total crawled items: " +str(items_count)+ " <br>Total Actual Items: " + str(actual_count) +" <br>Please QC the data value..<br><br>Thanks<br>Trailytics Team"
# utils.send_mail("no-reply#trailytics.com", "vijay.kothawar#trailytics.com;ashish.rawat#trailytics.com;anirudh.varshney#trailytics.com;ashutosh.shukla#trailytics.com", subject, body)
# print("Crawling process has been completed")
# except Exception as e:
# print("Mail Sending error:" + str(e))
print("Finish")
I have this final main.py that combines every function I wrote separately, but I can't make it work: it returns the Success message at the end, yet it actually does nothing, neither in my local folders nor in MongoDB. The function is this one:
def gw2_etl(url):
def log_scrape(url):
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
response = requests.get(url=url, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')
data = soup.find_all('script')[8]
dataString = data.text.rstrip()
logData = re.findall(r'{.*}', dataString)
try:
urlLines = url.split('/')
if len(urlLines) < 5:
bossName = urlLines[3]
elif len(urlLines) == 5:
bossName = urlLines[4]
except Exception as e:
return 'Error' + str(e)
tag = bossName.split('_')
bossTag = tag[1]
try:
# Wing_1
if bossTag == 'vg':
pathName = 'ETL\EXTRACT_00\Web Scraping\Boss_data\Wing_1\Valley_Guardian'
with open(f'{pathName}\{bossName}.json', 'w') as f:
for line in logData:
jsonFile = f.write(line)
return jsonFile
return log_scrape()
def store_data(jsonFile):
with open(jsonFile) as f:
data = json.load(f)
sp = jsonFile.split('\\')
posSp = sp[-1]
bossTag = posSp.split('_')
nameTag = bossTag[1]
if len(bossTag) > 2:
nameTag = bossTag[1]
elif len(bossTag) == 2:
tagSplit = nameTag.split('.')
nameTag = tagSplit[0]
# Players Data:
player_group = []
player_acc = []
player_names = []
player_classes = []
for player in data['players']:
player_group.append(player['group'])
player_acc.append(player['acc'])
player_names.append(player['name'])
player_classes.append(player['profession'])
try:
# Wing-1
if nameTag == 'vg':
# Create lists:
player_dps1 = []
player_dps2 = []
player_dps3 = []
# Phase_1
phase1 = data['phases'][1]['dpsStats']
phase1_time_raw = data['phases'][1]['duration']
phase1_time = round(phase1_time_raw/1000,1)
for dps in phase1:
dps1_raw = dps[0]
player_dps1.append(round(dps1_raw/phase1_time,2))
# Phase_2
phase2 = data['phases'][6]['dpsStats']
phase2_time_raw = data['phases'][6]['duration']
phase2_time = round(phase2_time_raw/1000,1)
for dps in phase2:
dps2_raw = dps[0]
player_dps2.append(round(dps2_raw/phase2_time,2))
# Phase_3
phase3 = data['phases'][12]['dpsStats']
phase3_time_raw = data['phases'][12]['duration']
phase3_time = round(phase3_time_raw/1000,1)
for dps in phase3:
dps3_raw = dps[0]
player_dps3.append(round(dps3_raw/phase3_time,2))
stats_dict = {
'players':{
'group': player_group,
'account': player_acc,
'names': player_names,
'profession': player_classes,
'phase_1_dps': player_dps1,
'phase_2_dps': player_dps2,
'phase_3_dps': player_dps3
}
}
df = pd.DataFrame(stats_dict['players'], columns=['group','account','names','profession','phase_1_dps','phase_2_dps','phase_3_dps'])
return stats_dict
except Exception as e:
print('Error' + str(e))
sys.exit()
# JSON generator (MongoDB)
pathName = 'ETL\TRANSFORM_01\Players_info'
jsonString = json.dumps(stats_dict)
with open(f"{pathName}\{nameTag}_player_stats.json", 'w') as f:
f.write(jsonString)
# CSV generator (MySQL, PostgreSQL)
df.to_csv(f"{pathName}\{nameTag}_player_stats.csv",index=True)
return store_data()
def mongo_connect(stats_dict):
try:
client = pymongo.MongoClient('mongodb://localhost:27017/')
except Exception as e:
print('Connection could not be done' + str(e))
sys.exit()
db = client['GW2_SRS']
collection = db['players_info']
mongo_insert = collection.insert_one(stats_dict)
return mongo_connect()
return 'Success!'
pass
My goal is that, when I call gw2_etl(), it runs every process inside (log_scrape, store_data and mongo_connect) and returns the Success message at the end. I'm probably doing it wrong, since it neither runs anything nor prints an error message.
For the mongo connection, I need to return stats_dict, since that is the JSON document I want to upload there; the CSV file is just for local storage.
I actually left some bosses out, since the full code is pretty long.
If you have any hint or clue about how could I make this work, I would be incredibly grateful.
You still need to call all of those functions from within gw2_etl() before returning from it. Defining functions inside another function just means you can't access them outside of the outer function. So before the return statement, add
log_scrape(url)
store_data(json_file)
mongo_connect(stats_dict)
and continue from there. You'll notice that you need to carry over some variables to invoke the functions with the correct arguments, but I left that part for you to figure out.
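For instance, a minimal sketch of the wiring (it assumes log_scrape() is changed to return the path of the JSON file it writes and store_data() to return the stats_dict it builds, which is not what they return at the moment):

def gw2_etl(url):
    def log_scrape(url):
        # ... scrape the page and write the JSON file (body omitted) ...
        json_path = 'boss_log.json'   # assumed: the path that was just written
        return json_path

    def store_data(json_path):
        # ... build stats_dict and write the JSON/CSV copies (body omitted) ...
        stats_dict = {}               # assumed: the dict built from the file
        return stats_dict

    def mongo_connect(stats_dict):
        # ... insert stats_dict into the players_info collection (body omitted) ...
        pass

    json_path = log_scrape(url)
    stats_dict = store_data(json_path)
    mongo_connect(stats_dict)
    return 'Success!'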
I was trying to download a full course from LinkedIn Learning using code from GitHub. I have already downloaded a couple of courses before, but this time, when I tried to download another course, the error below appeared.
PS: I do have a premium LinkedIn membership. Can't watch a course online all the time - that's why I download on my PC.
# -*- coding: utf-8 -*-
import requests
from requests import Session
from bs4 import BeautifulSoup
import urllib
import sys
import re
import os
import string
import config
import logging
reload(sys)
sys.setdefaultencoding('utf-8')
login_url = 'https://www.linkedin.com/'
post_login_url = 'https://www.linkedin.com/uas/login-submit'
course_api_url = 'https://www.linkedin.com/learning-api/detailedCourses??fields=fullCourseUnlocked,releasedOn,' \
'exerciseFileUrls,exerciseFiles&addParagraphsToTranscript=true&courseSlug=%s&q=slugs'
video_api_url = 'https://www.linkedin.com/learning-api/detailedCourses?addParagraphsToTranscript=false&courseSlug=%s' \
'&q=slugs&resolution=_720&videoSlug=%s'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/66.0.3359.181 Safari/537.36'
}
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
class Lld:
def __init__(self):
self.session = Session()
self.base_path = config.BASE_DOWNLOAD_PATH if config.BASE_DOWNLOAD_PATH else 'out'
@staticmethod
def plain_cookies(cookies):
plain = ''
for k, v in cookies.iteritems():
plain += k + '=' + v + '; '
return plain[:-2]
@staticmethod
def format_string(raw_string):
replacement_dict = {u'Ä': 'Ae', u'Ö': 'Oe', u'Ü': 'Ue', u'ä': 'ae', u'ö': 'oe', u'ü': 'ue', ':': ' -'}
invalid_chars = r'[^A-Za-z0-9\-\.]+'
u_map = {ord(key): unicode(val) for key, val in replacement_dict.items()}
raw_string = raw_string.translate(u_map)
raw_string = re.sub(invalid_chars, ' ', raw_string).strip().encode('utf-8')
i = 0
for c in raw_string:
if c in string.ascii_letters:
break
i += 1
return raw_string[i:]
@staticmethod
def format_time(ms):
seconds, milliseconds = divmod(ms, 1000)
minitues, seconds = divmod(seconds, 60)
hours, minitues = divmod(minitues, 60)
return '%d:%02d:%02d,%02d' % (hours, minitues, seconds, milliseconds)
def download_file(self, url, path, file_name):
resp = self.session.get(url, stream=True)
if not os.path.exists(path):
os.makedirs(path)
try:
with open(path + '/' + file_name, 'wb') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
except Exception as e:
os.remove(path + '/' + file_name)
print(e)
def download_sub(self, subs, path, file_name):
with open(path + '/' + file_name, 'a') as f:
i = 1
for sub in subs:
t_start = sub['transcriptStartAt']
if i == len(subs):
t_end = t_start + 5000
else:
t_end = subs[i]['transcriptStartAt']
caption = sub['caption']
f.write('%s\n' % str(i))
f.write('%s --> %s\n' % (self.format_time(t_start), self.format_time(t_end)))
f.write('%s\n\n' % caption)
i += 1
def download_desc(self, desc, course_url, path, file_name):
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + file_name, 'a') as f:
f.write('%s\n\n%s' % (desc, course_url))
def get_logged_session(self):
logging.info('Authenticating to LinkedIn')
login_page = BeautifulSoup(self.session.get(login_url).text, 'html.parser')
csrf = login_page.find(id='loginCsrfParam-login')['value']
logging.info('Csfr token: %s' % csrf)
login_data = urllib.urlencode(
{'session_key': config.USERNAME, 'session_password': config.PASSWORD, 'isJsEnabled': 'false',
'loginCsrfParam': csrf})
headers['Cookie'] = self.plain_cookies(requests.utils.dict_from_cookiejar(self.session.cookies))
self.session.headers.update(headers)
resp = self.session.post(post_login_url, data=login_data, allow_redirects=True)
if resp.status_code != 200:
logging.error('Could not authenticate to LinkedIn')
else:
logging.info('Authentication successfully completed')
def download_courses(self):
token = self.session.cookies.get('JSESSIONID').replace('"', '')
self.session.headers['Csrf-Token'] = token
self.session.headers['Cookie'] = self.plain_cookies(requests.utils.dict_from_cookiejar(self.session.cookies))
self.session.headers.pop('Accept')
for course in config.COURSES:
resp = self.session.get(course_api_url % course)
course_data = resp.json()['elements'][0]
course_name = self.format_string(course_data['title'])
logging.info('Starting download of course [%s]...' % course_name)
course_path = '%s/%s' % (self.base_path, course_name)
chapters_list = course_data['chapters']
chapter_index = 1
logging.info('Parsing course\'s chapters...')
logging.info('%d chapters found' % len(chapters_list))
for chapter in chapters_list:
chapter_name = self.format_string(chapter['title'])
logging.info('Starting download of chapter [%s]...' % chapter_name)
chapter_path = '%s/%s - %s' % (course_path, str(chapter_index).zfill(2), chapter_name)
if chapter_name == '':
chapter_path = chapter_path[:-3]
videos_list = chapter['videos']
video_index = 1
logging.info('Parsing chapters\'s videos')
logging.info('%d videos found' % len(videos_list))
for video in videos_list:
video_name = self.format_string(video['title'])
video_slug = video['slug']
video_data = (self.session.get(video_api_url % (course, video_slug)))
try:
video_url = re.search('"progressiveUrl":"(.+)","streamingUrl"', video_data.text).group(1)
except:
logging.error('Can\'t download the video [%s], probably is only for premium users' % video_name)
continue
logging.info('Downloading video [%s]' % video_name)
self.download_file(video_url, chapter_path, '%s - %s.mp4' % (str(video_index).zfill(2), video_name))
video_data = video_data.json()['elements'][0]
if config.SUBS:
try:
subs = video_data['selectedVideo']['transcript']['lines']
except KeyError:
logging.info('No subtitles avaible')
else:
logging.info('Downloading subtitles')
self.download_sub(subs, chapter_path, '%s - %s.srt' % (str(video_index).zfill(2), video_name))
video_index += 1
chapter_index += 1
exercises_list = course_data['exerciseFiles']
for exercise in exercises_list:
try:
ex_name = exercise['name']
ex_url = exercise['url']
except (KeyError, IndexError):
logging.info('Can\'t download an exercise file for course [%s]' % course_name)
else:
self.download_file(ex_url, course_path, ex_name)
description = course_data['description']
logging.info('Downloading course description')
self.download_desc(description, 'https://www.linkedin.com/learning/%s' % course, course_path, 'Description.txt')
def main():
lld = Lld()
lld.get_logged_session()
lld.download_courses()
if __name__ == '__main__':
main()
The error that appears:
Traceback (most recent call last):
File "lld.py", line 187, in <module>
main()
File "lld.py", line 182, in main
lld.get_logged_session()
File "lld.py", line 104, in get_logged_session
csrf = login_page.find(id='loginCsrfParam-login')['value']
TypeError: 'NoneType' object has no attribute '__getitem__'
This error means that login_page.find(id='loginCsrfParam-login') returned None, so there is nothing to take ['value'] from.
This is probably because the page you are parsing with Beautiful Soup (a very slow parser, by the way, but good for learning) does not contain the requested tag, or that tag does not have a 'value' attribute.
EDIT:
The reason you are getting this error is that there is no tag with the id "loginCsrfParam-login"
Here is a breakdown of what's going on in the interpreter:
The page is fetched (www.linkedin.com/index.html); it does not contain anything with the id "loginCsrfParam-login".
BeautifulSoup searches the parsed page for a tag with the id "loginCsrfParam-login"; it doesn't find one, so find() returns None.
You didn't write defensive code, so you never check the return value of find().
Python fails because you end up subscripting None as if it were a tag.
Your login_page doesn't contain that id, so the object whose value you are trying to read does not exist; find() returns None, and subscripting None raises the error.
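Whatever the page looks like now, it is worth guarding that lookup so the failure is explicit rather than a TypeError two lines later. A small sketch of a defensive version of that line (same variable names and imports as in get_logged_session above):

csrf_input = login_page.find(id='loginCsrfParam-login')
if csrf_input is None:
    # The login page no longer contains this element; log it and stop
    # instead of crashing on a None subscript further down.
    logging.error('loginCsrfParam-login not found; the login page layout has probably changed')
    sys.exit(1)
csrf = csrf_input['value']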
Python 3.7 adds ThreadingHTTPServer, as mentioned in the docs.
To run a server from the command line we use
python -m http.server
but does that still run the plain HTTPServer? Is there any way to enable the threading server via the command line?
EDITED:
Python 3.7 runs ThreadingHTTPServer by default when you use python -m http.server; no argument is necessary.
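If you want the threaded behaviour explicitly in your own script (rather than relying on the module's command-line default), a minimal Python 3.7+ sketch looks like this:

from http.server import ThreadingHTTPServer, SimpleHTTPRequestHandler

# Serve the current directory on port 8000, handling each request in its own
# thread; this mirrors what "python -m http.server" does by default on 3.7+.
if __name__ == '__main__':
    httpd = ThreadingHTTPServer(('', 8000), SimpleHTTPRequestHandler)
    httpd.serve_forever()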
Simple Python 2 HTTP Server with multi-threading and partial-content support
#!/usr/bin/env python2
# Standard library imports.
from SocketServer import ThreadingMixIn
import BaseHTTPServer
import SimpleHTTPServer
import sys
import json
import os
from os.path import (join, exists, dirname, abspath, isabs, sep, walk, splitext,
isdir, basename, expanduser, split, splitdrive)
from os import makedirs, unlink, getcwd, chdir, curdir, pardir, rename, fstat
from shutil import copyfileobj, copytree
import glob
from zipfile import ZipFile
from urlparse import urlparse, parse_qs
from urllib import urlopen, quote, unquote
from posixpath import normpath
from cStringIO import StringIO
import re
import ConfigParser
import cgi
import threading
import socket
import errno
DATA_DIR = getcwd() # join(expanduser('~'), APP_NAME)
class ThreadingHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
pass
class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
""" Handler to handle POST requests for actions.
"""
serve_path = DATA_DIR
def do_GET(self):
""" Overridden to handle HTTP Range requests. """
self.range_from, self.range_to = self._get_range_header()
if self.range_from is None:
# nothing to do here
return SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)
print 'range request', self.range_from, self.range_to
f = self.send_range_head()
if f:
self.copy_file_range(f, self.wfile)
f.close()
def copy_file_range(self, in_file, out_file):
""" Copy only the range in self.range_from/to. """
in_file.seek(self.range_from)
# Add 1 because the range is inclusive
bytes_to_copy = 1 + self.range_to - self.range_from
buf_length = 64*1024
bytes_copied = 0
while bytes_copied < bytes_to_copy:
read_buf = in_file.read(min(buf_length, bytes_to_copy-bytes_copied))
if len(read_buf) == 0:
break
out_file.write(read_buf)
bytes_copied += len(read_buf)
return bytes_copied
def send_range_head(self):
"""Common code for GET and HEAD commands.
This sends the response code and MIME headers.
Return value is either a file object (which has to be copied
to the outputfile by the caller unless the command was HEAD,
and must be closed by the caller under all circumstances), or
None, in which case the caller has nothing further to do.
"""
path = self.translate_path(self.path)
f = None
if isdir(path):
if not self.path.endswith('/'):
# redirect browser - doing basically what apache does
self.send_response(301)
self.send_header("Location", self.path + "/")
self.end_headers()
return None
for index in "index.html", "index.htm":
index = join(path, index)
if exists(index):
path = index
break
else:
return self.list_directory(path)
if not exists(path) and path.endswith('/data'):
# FIXME: Handle grits-like query with /data appended to path
# stupid grits
if exists(path[:-5]):
path = path[:-5]
ctype = self.guess_type(path)
try:
# Always read in binary mode. Opening files in text mode may cause
# newline translations, making the actual size of the content
# transmitted *less* than the content-length!
f = open(path, 'rb')
except IOError:
self.send_error(404, "File not found")
return None
if self.range_from is None:
self.send_response(200)
else:
self.send_response(206)
self.send_header("Content-type", ctype)
fs = fstat(f.fileno())
file_size = fs.st_size
if self.range_from is not None:
if self.range_to is None or self.range_to >= file_size:
self.range_to = file_size-1
self.send_header("Content-Range",
"bytes %d-%d/%d" % (self.range_from,
self.range_to,
file_size))
# Add 1 because ranges are inclusive
self.send_header("Content-Length",
(1 + self.range_to - self.range_from))
else:
self.send_header("Content-Length", str(file_size))
self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
self.end_headers()
return f
def list_directory(self, path):
"""Helper to produce a directory listing (absent index.html).
Return value is either a file object, or None (indicating an
error). In either case, the headers are sent, making the
interface the same as for send_head().
"""
try:
list = os.listdir(path)
except os.error:
self.send_error(404, "No permission to list directory")
return None
list.sort(key=lambda a: a.lower())
f = StringIO()
displaypath = cgi.escape(unquote(self.path))
f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
f.write("<html>\n<title>Directory listing for %s</title>\n" % displaypath)
f.write("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath)
f.write("<hr>\n<ul>\n")
for name in list:
fullname = os.path.join(path, name)
displayname = linkname = name
# Append / for directories or # for symbolic links
if os.path.isdir(fullname):
displayname = name + "/"
linkname = name + "/"
if os.path.islink(fullname):
displayname = name + "#"
# Note: a link to a directory displays with # and links with /
f.write('<li><a href="%s">%s</a>\n'
% (quote(linkname), cgi.escape(displayname)))
f.write("</ul>\n<hr>\n</body>\n</html>\n")
length = f.tell()
f.seek(0)
self.send_response(200)
encoding = sys.getfilesystemencoding()
self.send_header("Content-type", "text/html; charset=%s" % encoding)
self.send_header("Content-Length", str(length))
self.end_headers()
return f
def translate_path(self, path):
""" Override to handle redirects.
"""
path = path.split('?',1)[0]
path = path.split('#',1)[0]
path = normpath(unquote(path))
words = path.split('/')
words = filter(None, words)
path = self.serve_path
for word in words:
drive, word = splitdrive(word)
head, word = split(word)
if word in (curdir, pardir): continue
path = join(path, word)
return path
# Private interface ######################################################
def _get_range_header(self):
""" Returns request Range start and end if specified.
If Range header is not specified returns (None, None)
"""
range_header = self.headers.getheader("Range")
if range_header is None:
return (None, None)
if not range_header.startswith("bytes="):
print "Not implemented: parsing header Range: %s" % range_header
return (None, None)
regex = re.compile(r"^bytes=(\d+)\-(\d+)?")
rangething = regex.search(range_header)
if rangething:
from_val = int(rangething.group(1))
if rangething.group(2) is not None:
return (from_val, int(rangething.group(2)))
else:
return (from_val, None)
else:
print 'CANNOT PARSE RANGE HEADER:', range_header
return (None, None)
def get_server(port=8000, next_attempts=0, serve_path=None):
Handler = RequestHandler
if serve_path:
Handler.serve_path = serve_path
while next_attempts >= 0:
try:
httpd = ThreadingHTTPServer(("", port), Handler)
return httpd
except socket.error as e:
if e.errno == errno.EADDRINUSE:
next_attempts -= 1
port += 1
else:
raise
def main(args=None):
if args is None:
args = sys.argv[1:]
PORT = 8000
if len(args)>0:
PORT = int(args[-1])
serve_path = DATA_DIR
if len(args) > 1:
serve_path = abspath(args[-2])
httpd = get_server(port=PORT, serve_path=serve_path)
print "serving at port", PORT
httpd.serve_forever()
if __name__ == "__main__" :
main()
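A short usage sketch for the script above (threaded_server is just a placeholder for whatever file name you save it under):

# Start the threaded, range-aware server programmatically instead of via sys.argv.
from threaded_server import get_server   # placeholder module name

httpd = get_server(port=8080, next_attempts=5, serve_path='/path/to/files')
print 'Serving on port %d' % httpd.server_address[1]
httpd.serve_forever()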
The function below receives file chunks from web requests and assembles them. It works perfectly on Unix (OSX), but on Windows it doesn't. Specifically, the file does assemble, however it always ends up too small, just a few KB. I cannot figure out what is causing this. No exceptions are raised; it all appears to work, except that the final file is not all there. I've included the entire function for context, but I've marked the section that appears not to be working correctly. (Python 2.7 and Windows Server 2008 R2)
@view_config(route_name='upload', renderer='json')
def upload(request):
r = request.response
final_dir = 'w:\\foobar'
filename = request.params.get('flowFilename')
chunk_number = request.params.get('flowChunkNumber')
total_chunks = request.params.get('flowTotalChunks')
try:
temp_dir = os.path.join(final_dir, request.params.get('flowIdentifier'))
file_part = os.path.join(temp_dir, '%s.part.%s' % (filename, chunk_number))
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
except TypeError:
pass
if request.method == 'GET':
if file_part:
if os.path.isfile(file_part):
r.status = 200
else:
r.status = 404
return r
if request.POST:
try:
fo = request.params.get('file').file
f = open(file_part, 'wb')
f.write(fo.read())
f.close()
if chunk_number == total_chunks:
final_filename = os.path.join(final_dir, filename)
temp_filename = filename + '_INCOMPLETE'
#####################################################################
# This is where is appears to be going wrong...
final_file = open(temp_filename, 'a+b')
try:
for i in range(1, int(total_chunks) + 1):
ff = open(os.path.join(temp_dir, '%s.part.%s' % (filename, i)))
final_file.write(ff.read())
ff.close()
final_file.close()
os.rename(temp_filename, final_filename) # rename to final filename
shutil.rmtree(temp_dir) # clean up temp part files
except:
raise
####################################################################
r.status = 200
except Exception, e:
print 'ERROR', e.message
r.status = 404
return r
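One thing stands out on the Windows side: the part files are written with 'wb', but read back with open() and no mode, i.e. in text mode. On Windows, text mode translates line endings and stops reading at the first Ctrl-Z (0x1A) byte, which truncates binary data, so the reads in the marked section are a likely suspect. A sketch of the assembly loop with explicit binary reads (same names as in upload() above):

# Reassemble the parts in binary mode so Windows cannot translate or truncate them.
final_file = open(temp_filename, 'a+b')
try:
    for i in range(1, int(total_chunks) + 1):
        part_path = os.path.join(temp_dir, '%s.part.%s' % (filename, i))
        with open(part_path, 'rb') as ff:  # 'rb' instead of the default text mode
            final_file.write(ff.read())
finally:
    final_file.close()
os.rename(temp_filename, final_filename)  # rename to the final filename
shutil.rmtree(temp_dir)                   # clean up the temp part files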