Calling an inner function in Python - python

I have a final main.py that combines every function I wrote separately, but I can't make it work. It does return 'Success!' at the end, but it actually does nothing, neither in my local folders nor in MongoDB. The function is this one:
def gw2_etl(url):
    def log_scrape(url):
        HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
        response = requests.get(url=url, headers=HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        data = soup.find_all('script')[8]
        dataString = data.text.rstrip()
        logData = re.findall(r'{.*}', dataString)
        try:
            urlLines = url.split('/')
            if len(urlLines) < 5:
                bossName = urlLines[3]
            elif len(urlLines) == 5:
                bossName = urlLines[4]
        except Exception as e:
            return 'Error' + str(e)
        tag = bossName.split('_')
        bossTag = tag[1]
        try:
            # Wing_1
            if bossTag == 'vg':
                pathName = 'ETL\EXTRACT_00\Web Scraping\Boss_data\Wing_1\Valley_Guardian'
                with open(f'{pathName}\{bossName}.json', 'w') as f:
                    for line in logData:
                        jsonFile = f.write(line)
                    return jsonFile
            # ... (the other boss branches and this try's except clause were cut from the post)
        return log_scrape()
    def store_data(jsonFile):
        with open(jsonFile) as f:
            data = json.load(f)
        sp = jsonFile.split('\\')
        posSp = sp[-1]
        bossTag = posSp.split('_')
        nameTag = bossTag[1]
        if len(bossTag) > 2:
            nameTag = bossTag[1]
        elif len(bossTag) == 2:
            tagSplit = nameTag.split('.')
            nameTag = tagSplit[0]
        # Players Data:
        player_group = []
        player_acc = []
        player_names = []
        player_classes = []
        for player in data['players']:
            player_group.append(player['group'])
            player_acc.append(player['acc'])
            player_names.append(player['name'])
            player_classes.append(player['profession'])
        try:
            # Wing-1
            if nameTag == 'vg':
                # Create lists:
                player_dps1 = []
                player_dps2 = []
                player_dps3 = []
                # Phase_1
                phase1 = data['phases'][1]['dpsStats']
                phase1_time_raw = data['phases'][1]['duration']
                phase1_time = round(phase1_time_raw/1000,1)
                for dps in phase1:
                    dps1_raw = dps[0]
                    player_dps1.append(round(dps1_raw/phase1_time,2))
                # Phase_2
                phase2 = data['phases'][6]['dpsStats']
                phase2_time_raw = data['phases'][6]['duration']
                phase2_time = round(phase2_time_raw/1000,1)
                for dps in phase2:
                    dps2_raw = dps[0]
                    player_dps2.append(round(dps2_raw/phase2_time,2))
                # Phase_3
                phase3 = data['phases'][12]['dpsStats']
                phase3_time_raw = data['phases'][12]['duration']
                phase3_time = round(phase3_time_raw/1000,1)
                for dps in phase3:
                    dps3_raw = dps[0]
                    player_dps3.append(round(dps3_raw/phase3_time,2))
                stats_dict = {
                    'players':{
                        'group': player_group,
                        'account': player_acc,
                        'names': player_names,
                        'profession': player_classes,
                        'phase_1_dps': player_dps1,
                        'phase_2_dps': player_dps2,
                        'phase_3_dps': player_dps3
                    }
                }
                df = pd.DataFrame(stats_dict['players'], columns=['group','account','names','profession','phase_1_dps','phase_2_dps','phase_3_dps'])
                return stats_dict
        except Exception as e:
            print('Error' + str(e))
            sys.exit()
        # JSON generator (MongoDB)
        pathName = 'ETL\TRANSFORM_01\Players_info'
        jsonString = json.dumps(stats_dict)
        with open(f"{pathName}\{nameTag}_player_stats.json", 'w') as f:
            f.write(jsonString)
        # CSV generator (MySQL, PostgreSQL)
        df.to_csv(f"{pathName}\{nameTag}_player_stats.csv",index=True)
        return store_data()
    def mongo_connect(stats_dict):
        try:
            client = pymongo.MongoClient('mongodb://localhost:27017/')
        except Exception as e:
            print('Connection could not be done' + str(e))
            sys.exit()
        db = client['GW2_SRS']
        collection = db['players_info']
        mongo_insert = collection.insert_one(stats_dict)
        return mongo_connect()
    return 'Success!'
    pass
My goal is that, when I call gw2_etl(url), it runs every process inside (log_scrape, store_data and mongo_connect) and returns the success message at the end. I'm probably doing it wrong, since it neither runs anything nor raises an error.
For the Mongo connection I need stats_dict, since that is the JSON document I want to upload there; the CSV file is just for local storage.
I left some bosses out, since the code is actually pretty long.
If you have any hint or clue about how I could make this work, I would be incredibly grateful.

You still need to call all of those functions separately from within gw2_etl() before returning from it. Defining functions inside another function just means you can't access them from outside the outer function. So before the return statement add
    log_scrape(url)
    store_data(json_file)
    mongo_connect(stats_dict)
and continue from there. You'll notice that you need to carry over some variables to invoke the functions with the correct arguments, but I left that part for you to figure out.
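For illustration, here is a minimal sketch of how that wiring could look, assuming each inner function is changed to return the value the next step needs (log_scrape returns the path of the JSON it wrote, store_data returns stats_dict; the bodies shown as ... stay as in the question):

    def gw2_etl(url):
        def log_scrape(url):
            ...  # as in the question, but ending with: return json_path
        def store_data(json_path):
            ...  # as in the question, but ending with: return stats_dict
        def mongo_connect(stats_dict):
            ...  # as in the question
        # actually run the pipeline before returning
        json_path = log_scrape(url)
        stats_dict = store_data(json_path)
        mongo_connect(stats_dict)
        return 'Success!'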

Related

Value not appending to global array

I am trying to run a multithreaded email checker to see if the emails are valid on Office 365.
Looking over and over my code, I cannot seem to find the reason it's not working correctly.
It should be appending the email to a GOOD or BAD list.
Instead, it's not appending anything!
This is my code:
...
currentDirectory = os.getcwd() # set the current directory - /new/
# Locations
location_emails_goods = currentDirectory + '/contacts/goods/'
location_emails_bads = currentDirectory + '/contacts/bads/'
location_emails = currentDirectory + '/contacts/contacts.txt'

now = datetime.now()
todayString = now.strftime('%d-%m-%Y-%H-%M-%S')

FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None

ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'

# Get all emails
def get_contacts(filename):
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            emails.append(a_contact.strip())
    return emails

def saveLogs():
    global GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY, file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS
    #print(GOOD_EMAILS_ARRAY)
    for good in GOOD_EMAILS_ARRAY:
        file_goods.write(good + '\n')
    file_goods.close()
    for bad in BAD_EMAILS_ARRAY:
        file_bads.write(bad + '\n')
    file_bads.close()

def newChecker(email):
    global url, GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY
    s = req.session()
    body = '{"Username":"%s"}' % email
    request = req.post(url, data=body)
    response = request.text
    valid = re.search('"IfExistsResult":0,', response)
    invalid = re.search('"IfExistsResult":1,', response)
    if invalid:
        BAD_EMAILS_ARRAY.append(email)
        if valid:
            GOOD_EMAILS_ARRAY.append(email)
    else:
        if valid:
            GOOD_EMAILS_ARRAY.append(email)
        else:
            BAD_EMAILS_ARRAY.append(email)
    # The follow is showing empty array eventhough I have defined GOOD_EMAILS_ARRAY globally so it should be updating
    print(GOOD_EMAILS_ARRAY)

def mp_handler(p):
    global ALL_EMAILS
    p.map(newChecker, ALL_EMAILS)

if __name__ == '__main__':
    # Foreach email, parse it into our checker
    # Define a filename to save to
    FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
    FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
    file_bads = open(FILE_NAME_DATE_BADS, 'a')
    file_goods = open(FILE_NAME_DATE_GOODS, 'a')
    p = multiprocessing.Pool(500)
    mp_handler(p)
    saveLogs()
    p.close()
As you can see, I am trying to append an email to either GOOD_EMAILS_ARRAY or BAD_EMAILS_ARRAY.
BAD_EMAILS_ARRAY and GOOD_EMAILS_ARRAY are global variables, but for some reason nothing gets appended to them.
I am running this through multiprocessing, if you need to know.
Any ideas, or errors you can spot in my code?
Processes do not share memory; a global variable with the same name in two processes is two different objects.
If you need to share state between processes, see this:
https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
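As a minimal illustration of the difference (not from the original post): appending to a plain global list inside a worker process only changes that process's copy, while a multiprocessing.Manager list is shared through a proxy:

    from multiprocessing import Manager, Pool

    plain_list = []                      # each process has its own copy

    def worker(args):
        shared, value = args
        plain_list.append(value)         # modifies the child's copy only
        shared.append(value)             # goes through the manager proxy

    if __name__ == '__main__':
        manager = Manager()
        shared_list = manager.list()
        with Pool(4) as pool:
            pool.map(worker, [(shared_list, i) for i in range(10)])
        print(plain_list)                # [] in the parent
        print(list(shared_list))         # contains all 10 values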
Okay so it turns out that I just needed to use the Manager from multiprocessing:
from multiprocessing import Manager, Pool
then I could use a normal array through the manager such as:
# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()
This allowed me to use my script just like it was, only using these new Manager-backed lists, which works just how I wanted :)
...
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None

# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()

# Get all emails
def get_contacts(filename):
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            emails.append(a_contact.strip())
    return emails

ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'

def saveLogs():
    global file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS, good_list, bad_list
    for good in good_list:
        file_goods.write(good + '\n')
    file_goods.close()
    for bad in bad_list:
        file_bads.write(bad + '\n')
    file_bads.close()
    print('{} => Fully completed email scanning'.format(Fore.CYAN))
    print('{} => Good emails [{}] || Bad emails [{}]'.format(Fore.GREEN, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS))

def newChecker(email):
    global url, good_list, bad_list
    s = req.session()
    body = '{"Username":"%s"}' % email
    request = req.post(url, data=body)
    response = request.text
    valid = re.search('"IfExistsResult":0,', response)
    invalid = re.search('"IfExistsResult":1,', response)
    if invalid:
        bad_list.append(email)
        if valid:
            good_list.append(email)
    else:
        if valid:
            good_list.append(email)
        else:
            bad_list.append(email)

def mp_handler(p):
    global ALL_EMAILS
    p.map(newChecker, ALL_EMAILS)

if __name__ == '__main__':
    # Foreach email, parse it into our checker
    # Define a filename to save to
    FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
    FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
    file_bads = open(FILE_NAME_DATE_BADS, 'a')
    file_goods = open(FILE_NAME_DATE_GOODS, 'a')
    p = multiprocessing.Pool(500)
    mp_handler(p)
    saveLogs()
    p.close()

Creating Multiple Instances of a Selenium Scraper Class and Running Them in Parallel

So I have created a web scraper with Selenium that infinitely crawls a web page. I am trying to create two instances of this scraper and run them in parallel, so that two different portions of the site (or two different sites entirely) will be scraped at the same time. With my current code, both processes start and two Chrome instances launch, but only one actually starts scraping. The other just sits on the landing page and never moves. My current scraper class looks like this:
class clBot(Scraper):
    def __init__(self, light_or_dark):
        light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                             '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
        dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
        if light_or_dark == "light":
            self.xpaths_to_scrape = light_side_xpaths
            self.csv_file = "lightside.csv"
        elif light_or_dark == "dark":
            self.xpaths_to_scrape = dark_side_xpaths
            self.csv_file = "darkside.csv"
        else:
            print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
            quit()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        self.options = webdriver.ChromeOptions()
        #self.options.add_argument('--headless')
        self.options.add_argument('user-agent={self.user_agent}')
        self.current_region = ''
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()

    def identify_phone_number(self, string, phone_number_list):
        reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
        if len(reg) > 0:
            for r in reg:
                if r.strip() not in phone_number_list:
                    with open(self.csv_file, 'a') as csv:
                        csv.write("{}\n".format(r.strip()))
                    print("Extracted {} from listing".format(r.strip()))
                else:
                    print('Phone number already in list.')

    def extract_phone_number(self):
        try:
            with open(self.csv_file, 'r') as csv:
                current_phone_numbers = csv.read()
            posting_body = self.driver.find_element_by_id('postingbody')
            self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
            contact_info = self.driver.find_element_by_class_name('showcontact')
            contact_info.click()
            time.sleep(1)
            self.identify_phone_number(posting_body.text, current_phone_numbers)
        except TimeoutException:
            self.identify_phone_number(posting_body.text, current_phone_numbers)
            print('There is no phone number in this listing.')

    def scrape_pages(self):
        i=1
        while True:
            try:
                self.scraper_wait_class_until_all(self.driver, 'result-row')
                results = self.driver.find_elements_by_class_name('result-row')
                print("clicking result {}".format(i))
                results[i].find_element_by_class_name('result-title').click()
                self.extract_phone_number()
                self.driver.back()
                i+=1
            except IndexError:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                print('Navigating to next page.')
                next_button.click()
                i=1

    def choose_xpath_to_scrape(self, list_of_xpaths):
        xpath_index = randint(0, len(list_of_xpaths)-1)
        xpath = list_of_xpaths[xpath_index]
        return xpath

    def navigate_pages(self):
        try:
            while True:
                try:
                    self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                    rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                    nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                    child_items = nearby_cl.find_elements_by_class_name('s')
                    random = randint(1, len(child_items)-1)
                    time.sleep(3)
                    print("Clicking {}".format(child_items[random].text))
                    child_items[random].click()
                    for xpath in self.xpaths_to_scrape:
                        area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                        area_to_scrape.click()
                        self.scrape_pages()
                        self.driver.back()
                        time.sleep(1)
                except WebDriverException:
                    continue
        except Exception as e:
            print(e)
            return
        finally:
            self.driver.quit()
and the main.py file that opens the two processes and initializes them is as follows:
import scraper
from multiprocessing import Process, Manager

if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))
    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')
    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())
    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()
Any help would be appreciated!
Try passing your target as a reference to the function instead of calling it, like this: Process(target=darksideScraper.navigate_pages). Also refer to this for another example of how to use multiprocessing.
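For illustration, a corrected main.py could look like the sketch below (it assumes the clBot class from the question; only the target= lines change):

    import scraper
    from multiprocessing import Process

    if __name__ == "__main__":
        darksideScraper = scraper.clBot('dark')
        lightsideScraper = scraper.clBot('light')
        # Pass the bound method itself; Process calls it in the child.
        # target=...navigate_pages() would run the infinite loop right here,
        # in the parent, before the second Process is ever created.
        darkside = Process(target=darksideScraper.navigate_pages)
        lightside = Process(target=lightsideScraper.navigate_pages)
        darkside.start()
        lightside.start()
        darkside.join()
        lightside.join()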

How to optimize the memory usage of my python crawler

I am learning Python crawling these days, and I wrote a simple crawler to get pictures from Pixiv by Pixiv ID.
It works quite well, but here comes a big problem: while it is running, it takes up nearly 1.2 GB of memory on my computer.
However, sometimes it takes up just 10 MB; I really don't know which code causes such big memory usage.
I have uploaded the script to my VPS (a Vultr server with only 768 MB of memory) and tried to run it. As a result, I get a MemoryError.
So I wonder how to optimize the memory usage (even if it takes more time to run).
Here is my code:
(I have rewritten all the code to make it pass PEP 8; if anything is still unclear, please tell me which part confuses you.)
from lxml import etree
import re
import os
import requests


# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
    Filter_Server = re.compile("[\d]+")
    Filter_Posttime = re.compile("img\/[^_]*_p0")
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Picture_Type = [".png", ".jpg", ".gif"]
    for i in range(len(Picture_Type)):
        Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
                       + Posttime+Picture_Type[i]
        Picture = requests.get(Original_URL, headers=Headers, stream=True)
        if Picture.status_code == 200:
            break
    if Picture.status_code != 200:
        return -1
    Filename = "./pic/"\
               + str(Pixiv_ID) + "_p0"\
               + Picture_Type[i]
    Picture_File = open(Filename, "wb+")
    for chunk in Picture.iter_content(None):
        Picture_File.write(chunk)
    Picture_File.close()
    Picture.close()
    return 200


# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
    os.mkdir("./pic/" + str(Pixiv_ID))
    Filter_Server = re.compile("[\d]+")
    Filter_Posttime = re.compile("img\/[^_]*_p")
    Manga_URL = "http://www.pixiv.net/"+Tag_a_href
    Manga_HTML = requests.get(Manga_URL, headers=Headers)
    Manga_XML = etree.HTML(Manga_HTML.content)
    Manga_Pages = Manga_XML.xpath('/html/body'
                                  '/nav[@class="page-menu"]'
                                  '/div[@class="page"]'
                                  '/span[@class="total"]/text()')[0]
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Manga_HTML.close()
    Picture_Type = [".png", ".jpg", ".gif"]
    for Number in range(int(Manga_Pages)):
        for i in range(len(Picture_Type)):
            Original_URL = "http://i" + str(Server) + \
                           ".pixiv.net/img-original/"\
                           + Posttime + str(Number) + Picture_Type[i]
            Picture = requests.get(Original_URL, headers=Headers, stream=True)
            if Picture.status_code == 200:
                break
        if Picture.status_code != 200:
            return -1
        Filename = "./pic/"+str(Pixiv_ID) + "/"\
                   + str(Pixiv_ID) + "_p"\
                   + str(Number) + Picture_Type[i]
        Picture_File = open(Filename, "wb+")
        for chunk in Picture.iter_content(None):
            Picture_File.write(chunk)
        Picture_File.close()
        Picture.close()
    return 200


# Main function.
def get_pic(Pixiv_ID):
    Index_URL = "http://www.pixiv.net/member_illust.php?"\
                "mode=medium&illust_id="+str(Pixiv_ID)
    Headers = {'referer': Index_URL}
    Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
    if Index_HTML.status_code != 200:
        return Index_HTML.status_code
    Index_XML = etree.HTML(Index_HTML.content)
    Tag_a_href_List = Index_XML.xpath('/html/body'
                                      '/div[@id="wrapper"]'
                                      '/div[@class="newindex"]'
                                      '/div[@class="newindex-inner"]'
                                      '/div[@class="newindex-bg-container"]'
                                      '/div[@class="cool-work"]'
                                      '/div[@class="cool-work-main"]'
                                      '/div[@class="img-container"]'
                                      '/a/@href')
    Tag_img_src_List = Index_XML.xpath('/html/body'
                                       '/div[@id="wrapper"]'
                                       '/div[@class="newindex"]'
                                       '/div[@class="newindex-inner"]'
                                       '/div[@class="newindex-bg-container"]'
                                       '/div[@class="cool-work"]'
                                       '/div[@class="cool-work-main"]'
                                       '/div[@class="img-container"]'
                                       '/a/img/@src')
    if Tag_a_href_List == [] or Tag_img_src_List == []:
        return 404
    else:
        Tag_a_href = Tag_a_href_List[0]
        Tag_img_src = Tag_img_src_List[0]
    Index_HTML.close()
    if Tag_a_href.find("manga") != -1:
        return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
    else:
        return get_single(Pixiv_ID, Tag_img_src, Headers)


# Check whether the picture already exists.
def check_exist(Pixiv_ID):
    if not os.path.isdir("Pic"):
        os.mkdir("Pic")
    if os.path.isdir("./Pic/"+str(Pixiv_ID)):
        return True
    Picture_Type = [".png", ".jpg", ".gif"]
    Picture_Exist = False
    for i in range(len(Picture_Type)):
        Path = "./Pic/" + str(Pixiv_ID)\
               + "_p0" + Picture_Type[i]
        if os.path.isfile(Path):
            return True
    return Picture_Exist


# The script starts here.
for i in range(0, 38849402):
    Pixiv_ID = 38849402-i
    Picture_Exist = check_exist(Pixiv_ID)
    if not Picture_Exist:
        Return_Code = get_pic(Pixiv_ID)
        if Return_Code == 200:
            print str(Pixiv_ID), "finish!"
        elif Return_Code == -1:
            print str(Pixiv_ID), "got an unknown error."
        elif Return_Code == 404:
            print str(Pixiv_ID), "not found. Maybe deleted."
    else:
        print str(Pixiv_ID), "picture exists!"
OMG!
Finally, I know what went wrong.
I used mem_top() to see what takes up the memory.
Guess what?
It is for i in range(0, 38849402):
In memory there is a list [0, 1, 2, 3 ... 38849401], which takes up my memory.
I changed it to:
    Pixiv_ID = 38849402
    while Pixiv_ID > 0:
        # some code here
        Pixiv_ID = Pixiv_ID - 1
Now the memory usage is no more than 20 MB.
Feeling excited!
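For reference, the same effect can be had without restructuring the loop: in Python 2 (which this script targets, given its print statements) xrange produces the numbers lazily instead of building a 38-million-element list, and in Python 3 range already behaves that way. A sketch:

    # Python 2: xrange yields one number at a time, so no huge list is built.
    # (In Python 3, range is already lazy and xrange no longer exists.)
    for i in xrange(0, 38849402):
        Pixiv_ID = 38849402 - i
        # ... rest of the loop body unchanged ...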

Use of files on hard drives instead of URLs with Python

I would like to modify this script to use offline files. If I download the file from a URL it works, but if I take the same file from my hard drive it does not open. Can someone help me understand why, and how to do it? Thank you.
def INDEX():
    TVLIST('https://www.*********/playlist/*******/test.m3u')

def TVLIST(url):
    try:
        m3u = getHtml(url)
        parsem3u(m3u)
    except:
        addDir('Nothing found', '', '', '', Folder=False)
    xbmcplugin.endOfDirectory(int(sys.argv[1]))

urlopen = urllib2.urlopen
Request = urllib2.Request

def getHtml(url, referer=None, hdr=None, data=None):
    if not hdr:
        req = Request(url, data, headers)
    else:
        req = Request(url, data, hdr)
    if referer:
        req.add_header('Referer', referer)
    if data:
        req.add_header('Content-Length', len(data))
    response = urlopen(req)
    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO( response.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
        f.close()
    else:
        data = response.read()
    response.close()
    return data

def parsem3u(html, sitechk=True):
    match = re.compile('#.+,(.+?)\n(.+?)\n').findall(html)
    txtfilter = txtfilter = GETFILTER()
    txtfilter = txtfilter.split(',') if txtfilter else []
    txtfilter = [f.lower().strip() for f in txtfilter]
    i = 0
    count = 0
    for name, url in match:
        status = ""
        url = url.replace('\r','')
        if not txtfilter or any(f in name.lower() for f in txtfilter):
            if sitechk:
                if i < 5:
                    try:
                        siteup = urllib.urlopen(url).getcode()
                        status = " [COLOR red]offline[/COLOR]" if siteup != 200 else " [COLOR green]online[/COLOR]"
                    except: status = " [COLOR red]offline[/COLOR]"
                    i += 1
            addPlayLink(name+status, url, 3, uiptvicon)
            count += 1
    return count
I thought it was enough to put in the local path:
def INDEX():
    TVLIST(r'c:\Desktop\IPTVLIST\M3U\playlist\test.m3u')
Can someone explain why it does not work, and how I can do it? Thank you.
As suggested by @languitar in the comments, you could use a file:// URL, which of course should work on Windows; but moving to a platform like Android you have a different file system there, and you don't have a C drive. So make sure you have an alternative location for the file on Android.
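For illustration, a minimal sketch (reusing getHtml, parsem3u, addDir and xbmcplugin from the question's script) that reads the playlist straight from disk when the argument is not an http(s) URL, instead of pushing a local path through urllib2:

    def TVLIST(path_or_url):
        try:
            if path_or_url.lower().startswith(('http://', 'https://')):
                m3u = getHtml(path_or_url)          # remote playlist, as before
            else:
                with open(path_or_url, 'rb') as f:  # local playlist on disk
                    m3u = f.read()
            parsem3u(m3u)
        except:
            addDir('Nothing found', '', '', '', Folder=False)
        xbmcplugin.endOfDirectory(int(sys.argv[1]))

    def INDEX():
        # Windows path; on Android the playlist has to live somewhere the
        # add-on can actually reach, e.g. its profile directory.
        TVLIST(r'c:\Desktop\IPTVLIST\M3U\playlist\test.m3u')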

Appending a list to another list replaces whitespaces with \xa0 in front of Integers

I'm writing a script to scrape data off the web. It creates lists to store the results of each page, which are then appended to one big list.
Everything is working fine and dandy until I try to do the final step of appending one list to the other. This is the code section in question:
result = makeSearch(item)
#######################################################
#EVERYTHING IS STILL FINE WHEN YOU PRINT AT THIS POINT#
#######################################################
#printList(result)
##################################################
#APPENDING THE LIST TO THE LIST CREATES THE ERROR#
##################################################
adresses.append(result)
For example, this turns "Brückstr. 29" into "Brückstr.\xa029". I tried to remove it with string.replace('\\xa0', ' '), but to no avail. It doesn't do a thing.
I have a feeling that it has to do with the combination of numbers and characters, but that doesn't explain why it only happens when you try to append it to another list.
If you try to run my program, use Aachen or another German city for "Enter location: ".
This is the complete program:
import urllib.request
import time
import csv
from bs4 import BeautifulSoup


#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
    requestResult = urllib.request.urlopen(request)
    soup = BeautifulSoup(requestResult, from_encoding='iso-8859-1')
    return soup


#Returns all the result links from the given search parameters
def getLinksFromSearch(location):
    database = []
    links_unsortiert = []

    #The search parameters
    params = {
        'subject': 'Taxi',
        'location': location,
        #'distance': '-1',
        #'execute': 'Suchen',
        #'suggest_choose': 'on',
        #'radial_check': 'on',
    }

    DATA = urllib.parse.urlencode(params)
    DATA = DATA.encode('iso-8859-1')

    request = urllib.request.Request(
        "http://www.gelbeseiten.de/yp/search.yp?subject=Taxi&location=" + location,
        DATA)

    # adding charset parameter to the Content-Type header.
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")

    #The search request
    soup = doRequest(request)

    for link in soup.find_all('a'):
        database.append(link.get('href'))

    for item in database:
        if item.startswith("http://adresse.gelbeseiten.de/"):
            links_unsortiert.append(item)

    links = list(set(links_unsortiert))
    return links


#Performs a search on the link results
def searchOnLinks(links):
    adresses = []
    i = 1
    j = len(links)
    print("Gathering information, please wait...")
    for item in links:
        print("(" , i , "/" , j , ") Making request...")
        result = makeSearch(item)
        ########################################
        #EVERYTHING IS STILL FINE AT THIS POINT#
        ########################################
        printList(result)
        ##################################################
        #APPENDING THE LIST TO THE LIST CREATES THE ERROR#
        ##################################################
        adresses.append(result)
        for elem in adresses:
            for element in elem:
                element = element.replace('\xa0', ' ')
        i = i + 1
        time.sleep(0.3)
    print("All done.")
    return adresses


def makeSearch(link):
    request = urllib.request.Request(link)

    #Adding charset parameter to the Content-Type header.
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")

    #The search request
    soup = doRequest(request)

    name = ''
    strasse = ''
    plz = ''
    stadt = ''
    telefon = ''
    mail = ''
    url = ''

    data = [
        #'Name',
        #'Straße',
        #'PLZ',
        #'Stadt',
        #'Telefon',
        #'E-Mail',
        #'Homepage'
    ]

    try:
        fieldValue = soup.find(itemprop="name")
        name = fieldValue.next_element
        data.append(name)
    except AttributeError:
        print("Name not found!")

    try:
        fieldValue = soup.find(itemprop="streetAddress")
        strasse = fieldValue.next_element
        data.append(strasse)
    except AttributeError:
        print("Street not found!")

    try:
        fieldValue = soup.find(itemprop="postalCode")
        plz = fieldValue.next_element
        data.append(plz)
    except AttributeError:
        print("Zipcode not found!")

    try:
        fieldValue = soup.find(itemprop="addressLocality")
        stadt = fieldValue.next_element
        data.append(stadt)
    except AttributeError:
        print("City not found!")

    return data


def printList(liste):
    for element in liste:
        print(element)


#The main input/output function
def inputOutput():
    location = []
    while True:
        location = input("Enter location: ")
        try:
            links = getLinksFromSearch(location)
            break
        except urllib.error.HTTPError:
            print("Error! Input raised an HTTP-Exception. Please enter valid input.")

    #Checks if the search yielded any results
    if len(links) > 0:
        print("The search returned", len(links), "result(s).")
        print('To proceed, enter "go".')
        localVar = input('To do a new search, enter any key: ')
        if localVar == 'go':
            data = searchOnLinks(links)
            printList(data)
            saveData = input('Enter "save" if you want to save: ')
            if saveData == 'save':
                file_name = input("Save as: ")
                print("Writing to file...")
                with open(file_name + '.csv', 'w', newline='') as fp:
                    a = csv.writer(fp, delimiter=',')
                    a.writerows(data)
            else:
                return
        else:
            return
    else:
        print("The search returned no results.")


#Program entry point
def main():
    while True:
        inputOutput()
        inputVar = input('If you want to run the application again, enter "y". To exit, enter any key: ')
        if inputVar != 'y':
            break

main()
It turns out that when I save the data to a .csv file, the whitespaces are shown properly, so never mind then.
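A side note on the original symptom, as a short sketch (not part of the accepted resolution): '\xa0' is a non-breaking space coming from the scraped HTML, and it only shows up as the literal text \xa0 because printing a whole list displays each string's repr(). The cleanup loop in the question has no effect because reassigning the loop variable never touches the list, and the pattern '\\xa0' (a backslash followed by the letters x, a, 0) never occurs in the data; writing back into the list with the real character works:

    for elem in adresses:
        for k, element in enumerate(elem):
            # '\xa0' (a single escape, i.e. the non-breaking space itself)
            elem[k] = element.replace('\xa0', ' ')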
