When I try to crawl paper information with multiple processes, the processes never terminate after the information has been fetched:
(screenshot of the error)
When I comment out the code that fetches the information from the network, the processes end normally:
(screenshot of the normal run)
This error is troubling me and I have no idea what causes it. The network access is done with requests, and I call response.close().
Can anyone help this confused person? Thanks.
This is the whole code (my Python is Python 3.7):
from multiprocessing import Process, Queue, Pool,Manager,Value
import time, random
import requests
import re
from bs4 import BeautifulSoup
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
'Connection': 'close'
}
## Just get the html text
def GetUrlInfo(url):
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
response.close()
SoupData = BeautifulSoup(response.text, 'lxml')
return SoupData
def GetVolumeUrlfromUrl(url:str)->str:
"""input is Journal's url and output is a link and a text description to each issue of the journal"""
url = re.sub('http:', 'https:', url)
SoupDataTemp = GetUrlInfo(url+'index.html')
SoupData = SoupDataTemp.find_all('li')
UrlALL = []
for i in SoupData:
if i.find('a') != None:
volumeUrlRule = '<a href="(.*?)".*?>(.*?)</a>'  # the <a> markup was stripped when the post was rendered; pattern reconstructed from the u[0]/u[1] usage below
volumeUrlTemp = re.findall(volumeUrlRule,str(i),re.I)
# u = i.find('a')['href']
# # print(u)
for u in volumeUrlTemp:
if re.findall(url, u[0]):
# print(u)
UrlALL.append((u[0], u[1]), )
# print(UrlALL)
return UrlALL
def GetPaperBaseInfoFromUrlAll(url:str)->str:
"""The input is the url and the output is all the paper information obtained from the web page,
including, doi, title, author, and the date about this volume """
soup = GetUrlInfo(url)
temp1 = soup.find_all('li',class_='entry article')
temp2= soup.find_all('h2')
temp2=re.sub('\\n',' ',temp2[1].text)
# print(temp2)
volumeYear = re.split(' ',temp2)[-1]
paper = []
for i in temp1:
if i.find('div',class_='head').find('a')== None:
paperDoi = ''
else:
paperDoi = i.find('div',class_='head').find('a')['href']
title = i.find('cite').find('span',class_='title').text[:-2]
paper.append([paperDoi,title])
return paper,volumeYear
# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
# put the url into the query
def Write(query,value,num):
for count in range(num):
query.put(value[count][0],True)
# time.sleep(random.random())
print('write end')
# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
while True:
count = COUNT.get(True)
# print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
COUNT.put(count, True)
if not query.empty():
value = query.get(True)
count = COUNT.get(True)
count = count + 1
COUNT.put(count,True)
paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
numb = paperNumber.get(True)
numb = numb + len(paper)
paperNumber.put(numb) # just commented
# print(paper,thisYear)
PaperInfo1.put((paper,thisYear),) # just commented
print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
if not COUNT.empty():
count = COUNT.get(True)
# print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
COUNT.put(count,True)
if int(count) == int(num):
print("the process "+str(i)+" end ")
break
print('read end')
# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
for i in range(paperNumber.get(True)):
value = PaperInfo1.get(True)
print(value)
if __name__=='__main__':
r_num = 10 # the reader process number
w_num = 1 # the writer process number
w_cnt = UrlLen # the write counter
q = Queue(UrlLen) # the volume url queue
paperNumber = Queue(1) # the all paper number
COUNT = Queue(1) # the end tag
COUNT.put(int(0)) # first is zero
paperNumber.put(int(0)) # first is zero
PaperInfo1 = Queue()
r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]
time_start = time.time()
[task.start() for task in w_list]
[task.start() for task in r_list]
[task.join() for task in w_list]
[task.join() for task in r_list]
time_used = time.time() - time_start
GetPaperInfo(PaperInfo1, paperNumber)
print('time_used:{}s'.format(time_used))
I have no idea. Stepping through with the debugger, each process finally enters process.py -> line 297: try: self.run(), then line 300: util._exit_function(), and prints just one "connected".
(screenshot of the debugger)
I don't know why the network access causes this behaviour or how to solve it.
That's all. Thank you!
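For reference, a common way to make reader processes terminate deterministically is to have the writer put one sentinel value per reader on the work queue, instead of sharing a counter queue. The sketch below is not the original code: writer, reader and SENTINEL are illustrative names, and the scraping step is replaced by a placeholder so the example runs standalone.
from multiprocessing import Process, Queue

SENTINEL = None  # marker meaning "no more work"

def writer(q, urls, n_readers):
    for u in urls:
        q.put(u)
    for _ in range(n_readers):      # one sentinel per reader process
        q.put(SENTINEL)

def reader(q, results):
    while True:
        url = q.get()               # blocks until an item (or a sentinel) arrives
        if url is SENTINEL:
            break                   # no more work: leave the loop so the process can exit
        results.put(len(url))       # placeholder for the real page-fetching step

if __name__ == '__main__':
    q, results = Queue(), Queue()
    urls = ['https://example.org/volume%d.html' % i for i in range(20)]
    readers = [Process(target=reader, args=(q, results)) for _ in range(4)]
    for p in readers:
        p.start()
    writer(q, urls, len(readers))
    for p in readers:
        p.join()
    collected = [results.get() for _ in range(len(urls))]
    print('collected', len(collected), 'results')
With this pattern no shared counter is needed, so there is no window in which a reader waits forever for work that never arrives.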
Hi, this is me again. I tried a concurrent implementation with threads, and global variables shared between threads are much more comfortable than sharing data through process queues. With threads the fetching does work, but now my main function cannot be stopped. Previously, with processes, the program could not proceed to the next step while fetching concurrently; now the fetching is done through threads and the main function continues, but it can no longer exit. How interesting!
I have designed three functions similar to the previous ones.
GetUrlintoQueue writes the fetched URLs in UrlALL to the queue UrlQueue; UrlLen is the number of URLs.
import threading
import queue
count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers
def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
for index in range(UrlLen):
UrlQueue.put(UrlALL[index][0], True)
print('Write End')
UrlQueue.task_done()
The other is GetPaperInfofromUrl, which gets a URL from UrlQueue and writes the information of the corresponding page to PaperInfo; index is the thread number.
def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
global count,paperNumber
while True:
if not UrlQueue.empty():
url = UrlQueue.get(True)
count = count + 1
paper, thisYear = GetPaperBaseInfoFromUrlAll(url) # just commented
print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
print(paper,thisYear)
paperNumber = paperNumber + len(paper)
PaperInfo.put((paper, thisYear), True)
if count == UrlLen:
print("the process " + str(index) + " end ")
break
UrlQueue.task_done()
PaperInfo.task_done()
print('the process ' + str(index) +' get paper info end')
GetPaperInfo shows the results stored in PaperInfo; it is unchanged.
def GetPaperInfo(PaperInfo,paperNumber):
for i in range(paperNumber):
value = PaperInfo.get(True)
print(value)
The main function first sets up the corresponding variables, then fills the queue, then starts 10 threads to crawl the paper information, and finally shows the results. But after displaying the results the program still cannot exit, and I cannot understand why.
if __name__ == '__main__':
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
UrlQueue = queue.Queue(UrlLen)
PaperInfo = queue.Queue(1000)
WriteThread = 1
ReadThread = 10
# url write
GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
time_start = time.time()
[geturl.start() for geturl in GetUrlThread]
[geturl.join() for geturl in GetUrlThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# url write end
# paperinfo get
PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
time_start = time.time()
[getpaper.start() for getpaper in PaperinfoGetThread]
[getpaper.join() for getpaper in PaperinfoGetThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# paperinfo get end
GetPaperInfo(PaperInfo,paperNumber) # show the results
import sys # it does not work
sys.exit()
The debugger shows: debug.gif
(I don't have 10 reputation, so the picture is only a link.)
Here is how your process might look using concurrent.futures to manage all the threads and data transport. (not tested)
Adapting an example in the documentation.
from concurrent.futures import ThreadPoolExecutor, as_completed
def GetPaperInfofromUrl(index,url):
paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
return (index,url,paper,thisYear)
if __name__ == "__main__":
url = 'http://dblp.uni-trier.de/db/journals/talg/'
urls,descr = zip(*GetVolumeUrlfromUrl(url))
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
futs = [executor.submit(GetPaperInfofromUrl, index,url) for index,url in enumerate(urls)]
for future in as_completed(futs):
results.append(future.result())
GetPaperInfofromUrl seems superfluous, you could probably refactor GetPaperBaseInfoFromUrlAll and avoid a function call.
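If GetPaperInfofromUrl is folded away as suggested, executor.map can call GetPaperBaseInfoFromUrlAll directly and yields results in the same order as the input URLs. A sketch under the same assumptions as the answer above, reusing GetVolumeUrlfromUrl and GetPaperBaseInfoFromUrlAll from the question (not tested):
from concurrent.futures import ThreadPoolExecutor

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls, descr = zip(*GetVolumeUrlfromUrl(url))
    with ThreadPoolExecutor(max_workers=10) as executor:
        # each element of results is the (paper, volumeYear) tuple returned
        # by GetPaperBaseInfoFromUrlAll, in the same order as urls
        results = list(executor.map(GetPaperBaseInfoFromUrlAll, urls))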
Related
I have this final main.py that combines every function I wrote separately, but I can't make it work: it does return the Success message at the end, but it actually does nothing, neither in my local folders nor in MongoDB. The function is this one:
def gw2_etl(url):
def log_scrape(url):
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
response = requests.get(url=url, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')
data = soup.find_all('script')[8]
dataString = data.text.rstrip()
logData = re.findall(r'{.*}', dataString)
try:
urlLines = url.split('/')
if len(urlLines) < 5:
bossName = urlLines[3]
elif len(urlLines) == 5:
bossName = urlLines[4]
except Exception as e:
return 'Error' + str(e)
tag = bossName.split('_')
bossTag = tag[1]
try:
# Wing_1
if bossTag == 'vg':
pathName = 'ETL\EXTRACT_00\Web Scraping\Boss_data\Wing_1\Valley_Guardian'
with open(f'{pathName}\{bossName}.json', 'w') as f:
for line in logData:
jsonFile = f.write(line)
return jsonFile
return log_scrape()
def store_data(jsonFile):
with open(jsonFile) as f:
data = json.load(f)
sp = jsonFile.split('\\')
posSp = sp[-1]
bossTag = posSp.split('_')
nameTag = bossTag[1]
if len(bossTag) > 2:
nameTag = bossTag[1]
elif len(bossTag) == 2:
tagSplit = nameTag.split('.')
nameTag = tagSplit[0]
# Players Data:
player_group = []
player_acc = []
player_names = []
player_classes = []
for player in data['players']:
player_group.append(player['group'])
player_acc.append(player['acc'])
player_names.append(player['name'])
player_classes.append(player['profession'])
try:
# Wing-1
if nameTag == 'vg':
# Create lists:
player_dps1 = []
player_dps2 = []
player_dps3 = []
# Phase_1
phase1 = data['phases'][1]['dpsStats']
phase1_time_raw = data['phases'][1]['duration']
phase1_time = round(phase1_time_raw/1000,1)
for dps in phase1:
dps1_raw = dps[0]
player_dps1.append(round(dps1_raw/phase1_time,2))
# Phase_2
phase2 = data['phases'][6]['dpsStats']
phase2_time_raw = data['phases'][6]['duration']
phase2_time = round(phase2_time_raw/1000,1)
for dps in phase2:
dps2_raw = dps[0]
player_dps2.append(round(dps2_raw/phase2_time,2))
# Phase_3
phase3 = data['phases'][12]['dpsStats']
phase3_time_raw = data['phases'][12]['duration']
phase3_time = round(phase3_time_raw/1000,1)
for dps in phase3:
dps3_raw = dps[0]
player_dps3.append(round(dps3_raw/phase3_time,2))
stats_dict = {
'players':{
'group': player_group,
'account': player_acc,
'names': player_names,
'profession': player_classes,
'phase_1_dps': player_dps1,
'phase_2_dps': player_dps2,
'phase_3_dps': player_dps3
}
}
df = pd.DataFrame(stats_dict['players'], columns=['group','account','names','profession','phase_1_dps','phase_2_dps','phase_3_dps'])
return stats_dict
except Exception as e:
print('Error' + str(e))
sys.exit()
# JSON generator (MongoDB)
pathName = 'ETL\TRANSFORM_01\Players_info'
jsonString = json.dumps(stats_dict)
with open(f"{pathName}\{nameTag}_player_stats.json", 'w') as f:
f.write(jsonString)
# CSV generator (MySQL, PostgreSQL)
df.to_csv(f"{pathName}\{nameTag}_player_stats.csv",index=True)
return store_data()
def mongo_connect(stats_dict):
try:
client = pymongo.MongoClient('mongodb://localhost:27017/')
except Exception as e:
print('Connection could not be done' + str(e))
sys.exit()
db = client['GW2_SRS']
collection = db['players_info']
mongo_insert = collection.insert_one(stats_dict)
return mongo_connect()
return 'Success!'
pass
My goal is that, when I call gw2_etl(), it runs every step inside (log_scrape, store_data and mongo_connect) and returns the Success message at the end. I'm probably doing it wrong, since it neither runs anything nor sends an error message.
For the Mongo connection I need to return the stats_dict, since that is the JSON document I want to upload there; the CSV file is just for local storage.
I left some bosses out, since the full code is actually pretty long.
If you have any hint or clue about how I could make this work, I would be incredibly grateful.
You still need to call all of those functions separately from within the gw2_etl() before returning from the function. Defining functions inside another just means you can't access them outside of the outer function. So before the return statement add
log_scrape(url)
store_data(json_file)
mongo_connect(stats_dict)
and continue from there. You'll notice that you need to carry over some variables to invoke the functions with the correct arguments, but I left that part for you to figure out.
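For illustration only, the structure the answer describes looks roughly like the skeleton below; the stub bodies and return values stand in for the real log_scrape, store_data and mongo_connect, so only the wiring is shown.
# Skeleton of the call order; the stubs mark where the real function bodies go.
def gw2_etl(url):
    def log_scrape(url):
        return 'boss.json'          # stub: would scrape the page and return the JSON path

    def store_data(json_file):
        return {'players': {}}      # stub: would transform the JSON and return stats_dict

    def mongo_connect(stats_dict):
        pass                        # stub: would insert stats_dict into MongoDB

    json_file = log_scrape(url)     # each step feeds the next one
    stats_dict = store_data(json_file)
    mongo_connect(stats_dict)
    return 'Success!'

print(gw2_etl('https://example.org/some_log'))   # prints: Success!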
I have a Python scraper whose main purpose is to:
Read a list of postcodes from a text file into an array
for each postcode in the array
search 10 pages
pull out certain content.
I seem to be getting results like:
page 1
page 2
page 2
page 3
page 3
page 3
page 4
page 4
page 4
page 4
etc
I have tried rearranging the code several times without much luck; everything works fine except this step.
from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv
print(" Initializing ...")
print(" Loading Keywords")
with open("pcodes.txt") as pcodes:
postkeys = []
for line in pcodes:
postkeys.append(line.strip())
with open("pcodnum.txt") as pcodnum:
postkeynum = []
for line in pcodnum:
postkeynum.append(line.strip())
print(" Welcome to YellScrape v1.0")
print(" You ar searching yell.com ")
comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open(datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
data_list = []
for x in postkeys:
print(" Searching " + x + " for " + comtype + " companies")
for y in postkeynum:
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
data_list.append(url)
for item in data_list:
site = requests.get(item, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
listinnum += 1
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnumber = question.select_one('span.business--telephoneNumber')
if busnumber is None:
busnumber = 'None'
else:
busnumber = busnumber.text
busadd = question.find('span', attrs={"itemprop": "streetAddress"})
if busadd is None:
busadd = 'None'
else:
busadd = busadd.text.replace(',',' ')
buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
if buslocal is None:
buslocal = 'None'
else:
buslocal = buslocal.text
buspost = question.find('span', attrs={"itemprop": "postalCode"})
if buspost is None:
buspost = 'None'
else:
buspost = buspost.text
busweb = question.find('a', attrs={"rel": "nofollow noopener"})
if busweb is None:
busweb = 'None'
else:
busweb = busweb.attrs['href']
print(busweb)
f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])
pagesnum += 1
print(" Finsihed Page " + str(y) + ". For " + x + " . " + str(listinnum) + " listings so far. Moving To Next Page")
print(" Waiting 30 seconds for security reasons.")
sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')
Expected result:
finished page 1
finished page 2
finished page 3
etc
It's because you are appending to your data list and then iterating through the whole list again after each new link is appended.
So it's going to do the requests for page 1, then for pages 1 and 2, then for pages 1, 2 and 3, then pages 1, 2, 3 and 4... etc.
So there are two ways to fix that: 1) don't build data_list at all and eliminate it altogether, or 2) append to data_list FIRST and then loop through it (i.e. separate the loop that appends to data_list from the loop that iterates through it).
I chose option 2:
from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv
print(" Initializing ...")
print(" Loading Keywords")
with open("C:/pcodes.txt") as pcodes:
postkeys = []
for line in pcodes:
postkeys.append(line.strip())
with open("C:/pcodnum.txt") as pcodnum:
postkeynum = []
for line in pcodnum:
postkeynum.append(line.strip())
print(" Welcome to YellScrape v1.0")
print(" You are searching yell.com ")
comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open('C:/'+datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
data_list = []
for x in postkeys:
print(" Searching " + x + " for " + comtype + " companies")
for y in postkeynum:
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
data_list.append(url)
# Now that you created a list of the urls, now you can loop through them
for item in data_list:
page = item.split('pageNum=')[-1].split('&')[0]
location = item[-5:]
site = requests.get(item, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
listinnum += 1
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnumber = question.select_one('span.business--telephoneNumber')
if busnumber is None:
busnumber = 'None'
else:
busnumber = busnumber.text
busadd = question.find('span', attrs={"itemprop": "streetAddress"})
if busadd is None:
busadd = 'None'
else:
busadd = busadd.text.replace(',',' ')
buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
if buslocal is None:
buslocal = 'None'
else:
buslocal = buslocal.text
buspost = question.find('span', attrs={"itemprop": "postalCode"})
if buspost is None:
buspost = 'None'
else:
buspost = buspost.text
busweb = question.find('a', attrs={"rel": "nofollow noopener"})
if busweb is None:
busweb = 'None'
else:
busweb = busweb.attrs['href']
print(busweb)
f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])
pagesnum += 1
print(" Finished Page " + page + ". For " + location + " . " + str(listinnum) + " listings so far. Moving To Next Page")
if item != data_list[-1]:
print(" Waiting 30 seconds for security reasons.")
sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')
Initialize pageNum inside the outer for loop:
for x in postkeys:
pageNum = 1
Increment pageNum inside the loop and format the URL:
for item in data_list:
#format website url
url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
site = requests.get(url, headers=headers)
# check response status code:
if site.status_code != 200:
break
pageNum += 1
You should remove this for loop:
for y in postkeynum:
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
data_list.append(url)
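Putting the two suggestions together, a rough (untested) sketch of the loop structure could look like this; the postcodes and company type are hard-coded placeholders for what the original script reads from its files and from input():
import requests
from time import sleep
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0'}
postkeys = ['AB1', 'AB2']        # placeholder postcodes; the real ones come from pcodes.txt
comtype = 'Barber'               # placeholder company type
max_pages = 10

for x in postkeys:
    for page_num in range(1, max_pages + 1):
        url = ('https://www.yell.com/ucs/UcsSearchAction.do?keywords={}'
               '&pageNum={}&location={}').format(comtype, page_num, x)
        site = requests.get(url, headers=headers)
        if site.status_code != 200:          # stop paging this postcode on an error page
            break
        soup = BeautifulSoup(site.content, 'html.parser')
        # ... extract the business capsules here, exactly as in the original inner loop ...
        print(" Finished page {} for {}".format(page_num, x))
        sleep(30)                            # keep the original politeness delay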
I am learning Python crawling these days, and I wrote a simple crawler to fetch pictures from Pixiv by Pixiv ID.
It works quite well, but here comes a big problem: while it is running, it takes up nearly 1.2 GB of memory on my computer.
However, sometimes it takes up only about 10 MB, and I really don't know which part of the code causes such large memory usage.
I have uploaded the script to my VPS (a Vultr server with only 768 MB of memory) and tried to run it. As a result, I get a MemoryError.
So I wonder how to optimize the memory usage (even if it takes more time to run).
Here is my code:
(I have rewritten all the code to make it pass PEP 8 checks; if anything is still unclear, please tell me which part confuses you.)
from lxml import etree
import re
import os
import requests
# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p0")
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Picture_Type = [".png", ".jpg", ".gif"]
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
+ Posttime+Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"\
+ str(Pixiv_ID) + "_p0"\
+ Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
os.mkdir("./pic/" + str(Pixiv_ID))
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p")
Manga_URL = "http://www.pixiv.net/"+Tag_a_href
Manga_HTML = requests.get(Manga_URL, headers=Headers)
Manga_XML = etree.HTML(Manga_HTML.content)
Manga_Pages = Manga_XML.xpath('/html/body'
'/nav[@class="page-menu"]'
'/div[@class="page"]'
'/span[@class="total"]/text()')[0]
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Manga_HTML.close()
Picture_Type = [".png", ".jpg", ".gif"]
for Number in range(int(Manga_Pages)):
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + \
".pixiv.net/img-original/"\
+ Posttime + str(Number) + Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"+str(Pixiv_ID) + "/"\
+ str(Pixiv_ID) + "_p"\
+ str(Number) + Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Main function.
def get_pic(Pixiv_ID):
Index_URL = "http://www.pixiv.net/member_illust.php?"\
"mode=medium&illust_id="+str(Pixiv_ID)
Headers = {'referer': Index_URL}
Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
if Index_HTML.status_code != 200:
return Index_HTML.status_code
Index_XML = etree.HTML(Index_HTML.content)
Tag_a_href_List = Index_XML.xpath('/html/body'
'/div[@id="wrapper"]'
'/div[@class="newindex"]'
'/div[@class="newindex-inner"]'
'/div[@class="newindex-bg-container"]'
'/div[@class="cool-work"]'
'/div[@class="cool-work-main"]'
'/div[@class="img-container"]'
'/a/@href')
Tag_img_src_List = Index_XML.xpath('/html/body'
'/div[@id="wrapper"]'
'/div[@class="newindex"]'
'/div[@class="newindex-inner"]'
'/div[@class="newindex-bg-container"]'
'/div[@class="cool-work"]'
'/div[@class="cool-work-main"]'
'/div[@class="img-container"]'
'/a/img/@src')
if Tag_a_href_List == [] or Tag_img_src_List == []:
return 404
else:
Tag_a_href = Tag_a_href_List[0]
Tag_img_src = Tag_img_src_List[0]
Index_HTML.close()
if Tag_a_href.find("manga") != -1:
return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
else:
return get_single(Pixiv_ID, Tag_img_src, Headers)
# Check whether the picture already exists.
def check_exist(Pixiv_ID):
if not os.path.isdir("Pic"):
os.mkdir("Pic")
if os.path.isdir("./Pic/"+str(Pixiv_ID)):
return True
Picture_Type = [".png", ".jpg", ".gif"]
Picture_Exist = False
for i in range(len(Picture_Type)):
Path = "./Pic/" + str(Pixiv_ID)\
+ "_p0" + Picture_Type[i]
if os.path.isfile(Path):
return True
return Picture_Exist
# The script starts here.
for i in range(0, 38849402):
Pixiv_ID = 38849402-i
Picture_Exist = check_exist(Pixiv_ID)
if not Picture_Exist:
Return_Code = get_pic(Pixiv_ID)
if Return_Code == 200:
print str(Pixiv_ID), "finish!"
elif Return_Code == -1:
print str(Pixiv_ID), "got an unknown error."
elif Return_Code == 404:
print str(Pixiv_ID), "not found. Maybe deleted."
else:
print str(Pixiv_ID), "picture exists!"
OMG!
Finally, I know what went wrong.
I used mem_top() to see what takes up the memory.
Guess what?
It is for i in range(0, 38849402):
In memory there is a list [0, 1, 2, 3 ... 38849401], which takes up all the memory.
I changed it to:
Pixiv_ID = 38849402
while Pixiv_ID > 0:
some code here
Pixiv_ID = Pixiv_ID-1
Now the memory usage is no more than 20 MB.
Feeling excited!
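The underlying reason is that in Python 2 range() materializes the entire list up front, while xrange() (and Python 3's range()) produces numbers lazily. A tiny Python 2 demonstration of the difference:
import sys

# Python 2: range() builds a real list of one million ints,
# xrange() is a small lazy object that yields them on demand.
print(sys.getsizeof(range(1000000)))     # several megabytes for the list object alone
print(sys.getsizeof(xrange(1000000)))    # a few dozen bytes

for i in xrange(5):                      # iterates without ever building a list
    print(i)
So an equivalent fix to the while-countdown above is simply to use xrange (or to move to Python 3, where range behaves the same way).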
Right now I am working on a Python script which takes in a list of URLs as an argument, performs a GET request on each URL, and then searches through the output with XPath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything beyond that causes the program to slow down to the point where it stops (usually around 150 sites). The relevant code is below the "Main App Logic" comment. Right now I am just using 50 elements of the array and it works fine, but anything more makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0
# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys
# Get Raw HTML
def scrape(url):
try:
page = requests.get(url, timeout=2.0)
if page.status_code == requests.codes.ok:
html_page = html.fromstring(page.content)
s =requests.session()
s.close()
return html_page
else:
s =requests.session()
s.close()
return False
except:
s =requests.session()
s.close()
return False
# Format URL
def format_url(url):
if url.find("http://") == -1:
url = "http://"+url
if url[-1] == "/":
url = url[:-1]
return url
# Check if WordPress Site
def check_wordpress(tree):
scripts = tree.xpath("//script[contains(@src,'wp-content')]")
if len(scripts) > 0:
return True
return False
# Check WordPress Version
def wordpress_version(tree):
type = tree.xpath("//meta[@name='generator']/@content")
version = 0
if len(type) > 0:
details = type[0].split()
if len(details)>1 and details[0] == "WordPress":
if len(details) > 1:
version = details[1]
else:
version = type[0]
return version
# Find Contact Page
def find_contact_page(tree):
contact = tree.xpath("//a[contains(text(),'Contact')]/#href")
try_xpath = 1
while len(contact) == 0:
if try_xpath == 1:
contact = tree.xpath("//span[contains(text(),'Contact')]/../#href")
elif try_xpath == 2:
contact = tree.xpath("//p[contains(text(),'Contact')]/../#href")
elif try_xpath == 3:
break
try_xpath+=1
if len(contact) > 0:
contact = contact[0]
if contact.find('#') == -1:
if contact[0] == '/':
contact = url + "" + contact
print contact
# Juicer method
def juice(url):
url = format_url(url)
string = url
tree = scrape(url)
if tree == False:
return string + " \t\t\t No XML tree"
elif check_wordpress(tree) == True:
version = wordpress_version(tree)
return string + " \t\t\t WordPress: " + str(version)
else:
return string + " \t\t\t Not WordPress"
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
list = open(sys.argv[1],'r').read().split('\n')
# Juice url
def juice_url():
while True:
url = q.get()
result = juice(url)
print result
q.task_done()
# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
t = Thread(target=juice_url)
t.daemon = True
t.start()
# Add URL to Queue
time1 = time.time()
for url in list[0:50]:
q.put(url)
q.join()
# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total/50)
I'm writing a script to scrape data off the web. It creates lists to store the results of each page, which are then appended to one big list.
Everything is working fine and dandy until I try to do the final step: appending one list to the other. This is the code section in question:
result = makeSearch(item)
#######################################################
#EVERYTHING IS STILL FINE WHEN YOU PRINT AT THIS POINT#
#######################################################
#printList(result)
##################################################
#APPENDING THE LIST TO THE LIST CREATES THE ERROR#
##################################################
adresses.append(result)
For example, this turns "Brückstr. 29" into "Brückstr.\xa029". I tried to remove it with string.replace('\\xa0', ' '), but to no avail. It doesn't do a thing.
I have a feeling that it has to do with the combination of numbers and characters, but that doesn't explain why it only happens when you try to append it to another list.
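Two details matter here: str.replace() returns a new string instead of modifying anything in place, and the pattern '\\xa0' (a backslash followed by x, a, 0) is not the same as the single non-breaking-space character '\xa0'. Also, printing a whole list shows each string's repr, which displays that character as \xa0 even though the string itself is unchanged. A small standalone illustration:
row = ['Brückstr.\xa029', 'Aachen']

print(row)      # the list's repr escapes the non-breaking space: 'Brückstr.\xa029'
print(row[0])   # printing the element itself shows a normal-looking space

# replace() returns new strings, so build a new list instead of assigning
# to the loop variable
cleaned = [s.replace('\xa0', ' ') for s in row]
print(cleaned)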
If you try to run my program, use Aachen or another German city for "Enter location: ".
This is the complete program:
import urllib.request
import time
import csv
from bs4 import BeautifulSoup
#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
requestResult = urllib.request.urlopen(request)
soup = BeautifulSoup(requestResult, from_encoding='iso-8859-1')
return soup
#Returns all the result links from the given search parameters
def getLinksFromSearch(location):
database = []
links_unsortiert = []
#The search parameters
params = {
'subject': 'Taxi',
'location': location,
#'distance': '-1',
#'execute': 'Suchen',
#'suggest_choose': 'on',
#'radial_check': 'on',
}
DATA = urllib.parse.urlencode(params)
DATA = DATA.encode('iso-8859-1')
request = urllib.request.Request(
"http://www.gelbeseiten.de/yp/search.yp?subject=Taxi&location=" + location,
DATA)
# adding charset parameter to the Content-Type header.
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
#The search request
soup = doRequest(request)
for link in soup.find_all('a'):
database.append(link.get('href'))
for item in database:
if item.startswith("http://adresse.gelbeseiten.de/"):
links_unsortiert.append(item)
links = list(set(links_unsortiert))
return links
#Performs a search on the link results
def searchOnLinks(links):
adresses = []
i = 1
j = len(links)
print("Gathering information, please wait...")
for item in links:
print("(" , i , "/" , j , ") Making request...")
result = makeSearch(item)
########################################
#EVERYTHING IS STILL FINE AT THIS POINT#
########################################
printList(result)
##################################################
#APPENDING THE LIST TO THE LIST CREATES THE ERROR#
##################################################
adresses.append(result)
for elem in adresses:
for element in elem:
element = element.replace('\xa0', ' ')
i = i + 1
time.sleep(0.3)
print("All done.")
return adresses
def makeSearch(link):
request = urllib.request.Request(link)
#Adding charset parameter to the Content-Type header.
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
#The search request
soup = doRequest(request)
name = ''
strasse = ''
plz = ''
stadt = ''
telefon = ''
mail = ''
url = ''
data = [
#'Name',
#'Straße',
#'PLZ',
#'Stadt',
#'Telefon',
#'E-Mail',
#'Homepage'
]
try:
fieldValue = soup.find(itemprop="name")
name = fieldValue.next_element
data.append(name)
except AttributeError:
print("Name not found!")
try:
fieldValue = soup.find(itemprop="streetAddress")
strasse = fieldValue.next_element
data.append(strasse)
except AttributeError:
print("Street not found!")
try:
fieldValue = soup.find(itemprop="postalCode")
plz = fieldValue.next_element
data.append(plz)
except AttributeError:
print("Zipcode not found!")
try:
fieldValue = soup.find(itemprop="addressLocality")
stadt = fieldValue.next_element
data.append(stadt)
except AttributeError:
print("City not found!")
return data
def printList(liste):
for element in liste:
print(element)
#The main input/output function
def inputOutput():
location = []
while True:
location = input("Enter location: ")
try:
links = getLinksFromSearch(location)
break
except urllib.error.HTTPError:
print("Error! Input raised an HTTP-Exception. Please enter valid input.")
#Checks if the search yielded any results
if len(links) > 0:
print("The search returned", len(links), "result(s).")
print('To proceed, enter "go".')
localVar = input('To do a new search, enter any key: ')
if localVar == 'go':
data = searchOnLinks(links)
printList(data)
saveData = input('Enter "save" if you want to save: ')
if saveData == 'save':
file_name = input("Save as: ")
print("Writing to file...")
with open(file_name + '.csv', 'w', newline='') as fp:
a = csv.writer(fp, delimiter=',')
a.writerows(data)
else:
return
else:
return
else:
print("The search returned no results.")
#Program entry point
def main():
while True:
inputOutput()
inputVar = input('If you want to run the application again, enter "y". To exit, enter any key: ')
if inputVar != 'y':
break
main()
It turns out that when I save the data to a .csv file, the whitespace is shown properly, so never mind then.