Not sure what I am doing wrong... - ||| PYTHON ||| - python

I am building a small project while following a tutorial. I think I did everything correctly, but when I start the program I get an error.
Here is the code of my files:
1) the main file - frs.py
# frs.py: tokenizes hello.frs with Lexer, builds an AST with Parser and prints both.
# NOTE(review): Python versions before 3.10 ship a standard-library module that is
# also named "parser"; if that module is found instead of the local parser.py, this
# import fails with "cannot import name 'Parser'" - TODO confirm on the asker's setup.
from parser import Parser
from lexer import Lexer
def main():
filename = 'hello.frs'
# NOTE(review): the file object opened here is never closed.
file = open(filename, 'r')
lexer = Lexer(file)
# Parser is handed the lexer's token list before tokenizer() has filled it.
parser = Parser(lexer.tokens)
lexer.tokenizer()
print ("TOKENS:")
print (lexer.tokens, "\n")
parser.build_AST()
print ("AST:")
# NOTE(review): "parset" is not defined anywhere in this script; "parser" looks intended.
print (parset.AST, "\n")
if __name__ == "__main__":
main()
2) the Lexer class - lexer.py
class Lexer:
    """Character-level tokenizer for the tutorial language.

    ``data`` is any iterable of text lines (an open file, a list of strings).
    ``tokenizer()`` appends dicts of the form ``{'id': ..., 'value': ...}`` to
    ``self.tokens``; the ids produced are ``'label'``, ``'keyword'`` and
    ``'char'`` (a double-quoted string).
    """

    def __init__(self, data):
        """Remember the line source and start with an empty token list."""
        self.data = data
        self.tokens = []
        self.keywords = [
            'tosay'
        ]

    def tokenizer(self):
        """Scan every line of ``self.data`` and fill ``self.tokens`` in place.

        Outside of quotes: ``:`` closes a label, a completed keyword is
        emitted when the character after it is read (that character is
        consumed), and spaces are skipped.  Inside quotes every character,
        including ``:``, spaces and keyword text, belongs to the string.
        The character buffer is reset at the start of each line.
        """
        for loc in self.data:
            tmp = []
            tid = ''
            for l in loc:
                in_string = tid == 'char'
                if l == '"' and not in_string:
                    # Opening quote: start collecting a string literal.
                    tid = 'char'
                    tmp = []
                elif l == '"':
                    # Closing quote: emit the collected string.
                    self.tokens.append({'id': tid, 'value': ''.join(tmp)})
                    tid = ''
                    tmp = []
                elif in_string:
                    # Inside a string nothing is special; previously ':' and
                    # keyword text here were wrongly emitted as tokens.
                    tmp.append(l)
                elif l == ':':
                    self.tokens.append({'id': 'label', 'value': ''.join(tmp)})
                    tmp = []
                elif ''.join(tmp) in self.keywords:
                    self.tokens.append({'id': 'keyword', 'value': ''.join(tmp)})
                    tmp = []
                elif l == ' ':
                    continue
                else:
                    tmp.append(l)
3) the Parser class - parser.py
class Parser:
# Turns the lexer's token dicts into a list of {label: [nodes]} entries in self.AST.
def __init__(self, tokens):
# tokens: list of {'id': ..., 'value': ...} dicts; the list object is stored, not copied.
self.tokens = tokens
self.AST = []
def add_node(self, parent, node):
# Append node to the child list of every AST entry that has the key `parent`.
for a in self.AST:
if parent in a:
a[parent].append(node)
def build_AST(self):
# Walk self.tokens: a label opens a new AST entry; keyword and char tokens are
# collected in pairs (the first is saved, the second supplies the value).
saved = {}
parent = {}
collect = False
for token in self.tokens:
if token['id'] == 'label':
t = {token['value']: []}
# NOTE(review): after the first label `parent` holds a string while `t` is a dict,
# so this comparison presumably never finds them equal - TODO confirm the intent.
if parent != t:
parent = token['value']
self.AST.append(t)
elif token['id'] == 'keyword':
if token['value'] == 'stop':
t = {token['value']: 0}
self.add_node(parent, t)
else:
if collect == False:
saved = token
collect = True
else:
# NOTE(review): `value` is not a defined name here; the char branch below uses
# token['value'], which is presumably what was meant.
t = {saved['value']: token[:value]}
self.add_node(parent, t)
collect = False
elif token['id'] == 'char':
# NOTE(review): `=` is an assignment and is not valid in an `if` condition; the
# keyword branch above spells the same test with `==`.
if collect = False:
saved = token
collect = True
else:
t = {saved['value']: token['value']}
self.add_node(parent, t)
collect = False
4) the file written in my own language, which is the goal of the tutorial - hello.frs:
commence:
tosay "Hello World"
stop
Basically, until I added the from parser import Parser, everything worked. But after adding, I am getting this error message:
Traceback (most recent call last):
File "frs.py", line 1, in <module>
from parser import Parser
ImportError: cannot import name 'Parser'
I tried renaming the class, but it still doesn't work.
Please help me!
Thank you in advance.

Two errors in your files.
1) File parser.py:
Change:
if collect = False:
To
if collect == False:
2) File frs.py
Change:
print (parset.AST, "\n")
To:
print (parser.AST, "\n")
After Above Corrections My Output
TOKENS:
[{'id': 'label', 'value': 'commence'}, {'id': 'keyword', 'value': 'tosay'}, {'id': 'char', 'value': 'Hello World'}]
AST:
[{'commence': [{'tosay': 'Hello World'}]}]

Related

For Loop in Function in Python

I'm trying to shift some codes from my main function to a function called read but somehow the loop breaks and it doesn't go through my csv file.
Below are the 2 scripts and the csv.
Thank you for your advice and tips as the learning curve is getting steeper and steeper
---Code Below---
The script 'NotinFunct' will read the csv file and returns this data
The script 'InFunct' will read the same csv file but only returns one set of data
The 'NotinFunct' is
# -*- coding: utf-8 -*-
import csv
FILE = 'C://shared//API//NADEV-Numbers_20190220-092956.csv'
NBS = {'5684', '7445477'}
NEW_NBS = {'56847', '74454773'}
def main():
    """Print the rewritten number fields of every matching RoutePattern row.

    For each CSV row of type 'RoutePattern', each old number in NBS that
    occurs in one of the route fields is paired with each new number in
    NEW_NBS that contains it; the pattern, partition, rewritten pattern and
    the four rewritten party fields are printed, followed by a separator.

    NOTE(review): the listing lost its indentation; the prints are assumed to
    sit in the innermost block, which matches the output shown in the thread.
    """
    fields_route = {'Pattern', 'CalledX', 'CalledPrefix', 'CallingX', 'CallingPrefix'}

    def rewritten(value, old, replacement):
        # The literal placeholder 'None' becomes an empty string; anything
        # else gets the old number substituted by the new one.
        if value == 'None':
            return ''
        return value.replace(old, replacement)

    for row in csv.DictReader(open(FILE)):
        if row['Type'] != 'RoutePattern':
            continue
        for nb in NBS:
            for field in fields_route:
                if nb not in row[field]:
                    continue
                for new in NEW_NBS:
                    if nb not in new:
                        continue
                    report = [
                        row['Pattern'],
                        row['Partition'],
                        row['Pattern'].replace(nb, new),
                    ]
                    for column in ('CalledX', 'CalledPrefix', 'CallingX', 'CallingPrefix'):
                        report.append(rewritten(row[column], nb, new))
                    report.append('################')
                    for item in report:
                        print(item)


if __name__ == '__main__':
    main()
the 'InFunct' is
# -*- coding: utf-8 -*-
import csv
FILE = 'C://shared//API//NADEV-Numbers_20190220-092956.csv'
NBS = {'5684', '7445477'}
NEW_NBS = {'56847', '74454773'}
def read():
# Scans FILE for rows of type 'RoutePattern' and computes the rewritten number
# fields for every old number (NBS) that occurs in a route field and is contained
# in a new number (NEW_NBS).
fields_route = {'Pattern', 'CalledX', 'CalledPrefix', 'CallingX', 'CallingPrefix'}
for row in csv.DictReader(open(FILE)):
if row['Type'] == 'RoutePattern':
for nb in NBS:
for field in fields_route:
if nb in row[field]:
for new in NEW_NBS:
if nb in new:
rp = row['Pattern']
pt = row['Partition']
newrp = row['Pattern'].replace(nb, new)
# Each of the four party fields: the literal 'None' becomes '', otherwise the
# old number is replaced by the new one.
if row['CalledX'] == 'None':
cedp = row['CalledX'].replace('None', '')
else:
cedp = row['CalledX'].replace(nb, new)
if row['CalledPrefix'] == 'None':
pced = row['CalledPrefix'].replace('None', '')
else:
pced = row['CalledPrefix'].replace(nb, new)
if row['CallingX'] == 'None':
cingp = row['CallingX'].replace('None', '')
else:
cingp = row['CallingX'].replace(nb, new)
if row['CallingPrefix'] == 'None':
pcing = row['CallingPrefix'].replace('None', '')
else:
pcing = row['CallingPrefix'].replace(nb, new)
# NOTE(review): a `return` ends the function, so the caller receives a single
# 7-tuple no matter how many rows match. The listing lost its indentation, so
# which match is returned cannot be told from here - TODO confirm.
return rp, pt, newrp, cedp, pced, cingp, pcing
def main():
# Iterates over the elements of the one tuple returned by read(), printing each.
for test in read():
print(test)
if __name__ == '__main__':
main()
the csv is
Type,Pattern,Partition,Description,CalledX,CalledPrefix,CallingX,CallingPrefix,FwdAll,FwdBusyInt,FwdBusyExt,FwdNAnsInt,FwdNAnsExt,FwdNCovInt,FwdNCovExt,FwdCTIFail,FwdURegInt,FwdURegExt,ExtPNMask,Device
DirectoryNumber,875423,a_nothing_partition,a_nothing_DN,N/A,N/A,N/A,N/A,11,22,33,44,55,66,744547722,77,88,99,9898,SEP798798465143
DirectoryNumber,5684001,a_nothing_partition,None,N/A,N/A,N/A,N/A,None,None,None,None,None,None,None,None,None,None,N/A,N/A
TranslationPattern,568412,a_nothing_partition,a_nothing_tp,None,None,None,5236,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,568411,a_nothing_partition,a_nothing_tp,None,None,875421,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,744547720,a_nothing_partition,a_nothing_tp,961433,None,None,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,744547721,a_nothing_partition,a_nothing_tp,None,786512,None,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,47852,a_nothing_partition,a_nothing_tp,None,None,744547711,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,9632,a_nothing_partition,a_nothing_tp,None,None,None,5684,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,897435496,a_nothing_partition,a_nothing_tp,568433,None,None,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
TranslationPattern,7896312145697,a_nothing_partition,a_nothing_tp,None,7445477,None,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
RoutePattern,6568433,a_nothing_partition,None,None,None,None,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
RoutePattern,6568434,a_nothing_partition,None,None,None,None,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
RoutePattern,24132,a_nothing_partition,a_nothing_rp,None,None,7445477,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
HuntPilot,568444,a_nothing_partition,a_nothing_hunt pilot,88,99,66,77,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
CingPartyX,8787,a_nothing_partition,a_nothing_calling party X,N/A,N/A,11,744547722,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
CedPartyX,98563,a_nothing_partition,a_nothing_called party X,N/A,N/A,568496,None,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
Your read function only returns one set of values (the last one). If you changed the function to a generator you can get all the values.
Change the end of the read function to the following, making sure to align the yield to the innermost loop block:
...
if row['CallingPrefix'] == 'None':
pcing = row['CallingPrefix'].replace('None', '')
else:
pcing = row['CallingPrefix'].replace(nb, new)
yield rp, pt, newrp, cedp, pced, cingp, pcing
Then you get:
('6568433', 'a_nothing_partition', '65684733', '', '', '', '')
('6568434', 'a_nothing_partition', '65684734', '', '', '', '')
('24132', 'a_nothing_partition', '24132', '', '', '74454773', '')
Change your main function to the following to get similar output to NotInFunct:
def main():
    """Print every record produced by read(), one field per line, then a separator."""
    for record in read():
        for field in record:
            print(field)
        print('################')
Output:
6568433
a_nothing_partition
65684733
#################
6568434
a_nothing_partition
65684734
#################
24132
a_nothing_partition
24132
74454773
#################

PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed

# Queue on which every worker process deposits its (per-timestamp, flat) result pair.
output_rdpartition = mp.Queue()


def read_partition_zipfile(infile, stop_words, startline, endline):
    """Collect keywords per user from lines [startline, endline) of a gzip file.

    Each line is split on "|:|": the first field is an integer timestamp, the
    second a comma-separated keyword list and the last the user id.  Keywords
    shorter than two characters are dropped, the rest are lower-cased and
    filtered against ``stop_words``.

    Nothing is returned.  A tuple ``(by_user_and_time, by_user)`` is put on
    ``output_rdpartition``, where ``by_user_and_time`` maps
    user_id -> {timestamp: [keywords]} and ``by_user`` maps
    user_id -> [keywords].

    Both results are plain dicts: a ``defaultdict`` whose factory is a lambda
    cannot be pickled, and everything put on a multiprocessing queue is
    pickled.
    """
    chunk_user_d = {}
    chunk_user_withoutstamp_d = {}
    stop_set = frozenset(stop_words)
    # Open the file that was passed in (the original read the global in_file).
    with gzip.open(infile, "rb") as f:
        for j, line in enumerate(f):
            if j >= endline:
                # Past this partition: no need to decompress the rest.
                break
            if j < startline:
                continue
            if j % 10000 == 0:
                print("processed %d lines" % j)
            if not isinstance(line, str):
                # Python 3 yields bytes in "rb" mode; Python 2 yields str.
                line = line.decode("utf-8")
            # Strip only the line terminator, so a final line without a
            # newline does not lose the last character of its user id.
            fields = line.rstrip("\r\n").split("|:|")
            time_stamp = int(fields[0])
            user_id = fields[-1]
            keywords = [item.lower() for item in fields[1].split(',') if len(item) >= 2]
            keywords = [item for item in keywords if item not in stop_set]
            chunk_user_d.setdefault(user_id, {}).setdefault(time_stamp, []).extend(keywords)
            chunk_user_withoutstamp_d.setdefault(user_id, []).extend(keywords)
    output_rdpartition.put((chunk_user_d, chunk_user_withoutstamp_d))
def main():
    """Read three partitions of the input file in parallel and print the results.

    Uses the module-level ``in_file``, ``stop_words`` and ``p_index`` set up by
    the script entry point; worker ``j`` handles lines
    ``p_index[j]`` .. ``p_index[j + 1]``.  Each worker puts one
    ``(per-timestamp, flat)`` pair on ``output_rdpartition``.
    """
    print("at the start of main")
    processes_rd = [
        mp.Process(
            target=read_partition_zipfile,
            args=(in_file, stop_words, p_index[j], p_index[j + 1]),
        )
        for j in range(0, 3)
    ]
    for p in processes_rd:
        p.start()
    # Drain the queue BEFORE joining: a child that has put data on a queue
    # does not terminate until that data has been consumed.
    results_rd = [output_rdpartition.get() for p in processes_rd]
    for p in processes_rd:
        p.join()
    # In each result, index 0 is the per-timestamp mapping and index 1 the flat one.
    print(results_rd)
if __name__ == '__main__':
stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
stop_words = stop_words.split(",")
in_file = 'uniq.txt.gz'
p_index = range(0,28000000,2800000)
main()
It seems to be caused by the queue: I can print inside the function, but I cannot return the function's output.

Python syntax error, command, *args= line.split()

I am getting a Python syntax error for command, *args = line.split()
Here is my bce.py file
import sys
import os
import pcn
import urp
class BCE(object):
    """Boolean calculator engine driven by a command file.

    Functions are kept in ``self.eqs`` under the identifier used in the
    command file.  ``inPath`` is the directory ``.pcn`` inputs are read from
    and ``outPath`` the directory results are written to.
    """

    def __init__(self, inPath, outPath):
        """Store both directories and build the command dispatch table."""
        self.inPath = inPath
        self.outPath = outPath
        self.eqs = {}
        # Maps the first word of a command line to the method that handles it.
        self.operations = {
            "r": self.read_pcn,
            "!": self.do_not,
            "+": self.do_or,
            "&": self.do_and,
            "p": self.write_pcn,
            "q": self.quit
        }
        self.done = False

    def process(self, commandFilePath):
        """Execute the commands in ``commandFilePath`` one line at a time.

        The first whitespace-separated word selects the operation and the
        remaining words are passed to it as arguments.  Blank lines are
        skipped.  Processing stops as soon as a command sets ``self.done``.
        An unknown command raises KeyError.
        """
        with open(commandFilePath, "r") as f:
            for line in f:
                words = line.split()
                if not words:
                    # An empty line has no command word; unpacking it would
                    # raise ValueError.
                    continue
                command, *args = words
                self.operations[command](*args)
                if self.done:
                    return

    def read_pcn(self, fNum):
        """Load ``<inPath>/<fNum>.pcn`` into ``self.eqs[fNum]`` (cubes only)."""
        _, self.eqs[fNum] = pcn.parse(os.path.join(self.inPath, fNum + ".pcn"))

    def write_pcn(self, fNum):
        """Write ``self.eqs[fNum]`` to ``<outPath>/<fNum>.pcn``."""
        with open(os.path.join(self.outPath, fNum + ".pcn"), "w") as f:
            pcn.write(f, None, self.eqs[fNum])

    def do_not(self, resultNum, inNum):
        """Store ``urp.complement`` of function ``inNum`` under ``resultNum``."""
        self.eqs[resultNum] = urp.complement(self.eqs[inNum])

    def do_or(self, resultNum, leftNum, rightNum):
        """Store ``urp.cubes_or`` of the two operands under ``resultNum``."""
        self.eqs[resultNum] = urp.cubes_or(self.eqs[leftNum], self.eqs[rightNum])

    def do_and(self, resultNum, leftNum, rightNum):
        """Store ``urp.cubes_and`` of the two operands under ``resultNum``."""
        self.eqs[resultNum] = urp.cubes_and(self.eqs[leftNum], self.eqs[rightNum])

    def quit(self):
        """Flag the engine as finished so ``process`` stops reading commands."""
        self.done = True
# Help text; the placeholder is filled with the script name.
Usage = """\
USAGE: {} COMMAND_FILE
"""
if __name__ == "__main__":
    if len(sys.argv) > 1:
        solutionDir = "BCESolutions"
        # The output sub-directory is named after the fifth character from the
        # end of the command-file argument.
        # NOTE(review): presumably the digit in front of a 4-character
        # extension such as "cmd3.txt" - verify against the real file names.
        thisSolDir = os.path.join(solutionDir, sys.argv[1][-5])
        # makedirs also creates a missing "BCESolutions" parent, and
        # exist_ok=True tolerates an existing directory without hiding other
        # OS errors (the previous mkdir + "except OSError: pass" swallowed all).
        os.makedirs(thisSolDir, exist_ok=True)
        bce = BCE("BooleanCalculatorEngine", thisSolDir)
        bce.process(sys.argv[1])
    else:
        print(Usage.format(sys.argv[0]))
And here is my pcn.py file
from itertools import islice
from itertools import chain
def parse(filePath):
    """Read a .pcn file and return ``(numVars, cubes)``.

    The first line holds the variable count and the second the number of
    cubes; every following cube line starts with a count that is dropped,
    followed by the integer literals of the cube.  ``cubes`` is a tuple of
    tuples of ints.

    Raises AssertionError (chained to the underlying error) when the file
    cannot be read in this format.
    """
    with open(filePath, "rb") as handle:
        try:
            rows = iter(handle)
            var_total = int(next(rows))
            cube_total = int(next(rows))
            collected = []
            for _ in range(cube_total):
                # Convert every field, then drop the leading count.
                numbers = tuple(int(field) for field in next(rows).split())
                collected.append(numbers[1:])
            return (var_total, tuple(collected))
        except Exception as error:
            raise AssertionError("Bad pcn file {}".format(filePath)) from error
def write(f, numVars, cubes):
    """Write ``cubes`` to the open text file ``f`` in .pcn format.

    Output: a line with the largest absolute literal (0 when there are no
    literals), a line with the number of cubes written, one line per cube
    ("<length> <literals...>", literals ordered by absolute value), and a
    final empty line.

    ``numVars`` is accepted for symmetry with ``parse`` but is not used; the
    first line is always derived from the cubes themselves.

    Duplicate cubes are written once, and the count line reflects the cubes
    actually written so the file stays consistent with its own header.
    """
    endl = "\n"
    # default=0 keeps an empty cube list (or only empty cubes) from raising.
    largest = max((abs(literal) for cube in cubes for literal in cube), default=0)
    # De-duplicate before counting; the count used to be taken from the raw input.
    unique = tuple(set(tuple(sorted(cube, key=abs)) for cube in cubes))
    f.write(str(largest))
    f.write(endl)
    f.write(str(len(unique)))
    f.write(endl)
    for cube in unique:
        f.write(' '.join(map(str, chain((len(cube),), cube))))
        f.write(endl)
    f.write(endl)
Tuple assignment with a *star_target entry only works in Python 3. You cannot use it in Python 2. See PEP 3132 - Extended Iterable Unpacking.
As a workaround, just use one target and then use slicing:
split_result = line.split()
command, args = split_result[0], split_result[1:]

The reading loop of QXmlReader for PyQt5 does not return the expected data

I'd like to make an QAbstractItemModel that gets its data from a series of Xml files, all situated in the same directory. Since PyQt5 no longer supports QDomDocument (or atleast i couldn't find a way to make it work), i've had to resort to a QXmlStreamReader. I'm putting the data itself in a giant python dictionary (well... not exactly giant by computer science standards) that contains other dictionaries under various keys to create a tree-like structure.
this is my code so far:
class DataModel(QtCore.QAbstractItemModel):
# Reads every *.xml file in settingsDirectory with a QXmlStreamReader and stores
# what it finds in the dictionary self.data.
def __init__(self, settingsDirectory, parent = None):
super(DataModel, self).__init__(parent)
settingsDirectory.setNameFilters(["*.xml"])
files = settingsDirectory.entryList()
print(files)
# NOTE(review): QAbstractItemModel also defines a data() method; this attribute
# presumably hides it on the instance - TODO confirm this is acceptable.
self.data = {}
for i in range(len(files)):
filePath = str(files[i])
file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + str(filePath))
fileOpens = file.open(file.ReadOnly | file.Text)
if fileOpens:
parser = QtCore.QXmlStreamReader(file)
print("--------Beginning parsing----------")
print("Reading file: "+str(filePath))
while not parser.atEnd():
parser.readNext()
token = parser.tokenType()
print("Reading tag: " + str(parser.name()))
print("Tag type is: " + str(token))
if token == parser.StartDocument:
self.data["XML Version"] = str(parser.documentVersion())
self.data["XML Encoding"] = str(parser.documentEncoding())
if token == parser.StartElement:
tokenName = parser.name()
if parser.tokenType() == parser.Characters:
tokenText = parser.text()
print("This tag has a text value: " + str(tokenText))
print("current data: " + str(self.data))
if token == parser.EndElement:
# Every closing tag writes its entry straight into the top level of self.data;
# no reference to an enclosing element is kept, so the result stays flat.
# NOTE(review): tokenName and tokenText are first assigned in the branches above
# and reset to None below, which presumably explains the None key in the
# reported output - TODO confirm.
if tokenText != None:
self.data[tokenName] = tokenText
else:
self.data[tokenName] = {}
tokenName = None
tokenText = None
else:
print(self.tr("xml file did not open properly"))
print(self.data)
While this code doesn't crash or anything, it does have a few issues that i have no idea why they're happening or how to fix:
1.the tokenName never changes from None for some reason - solved
2.the structure of the self.data dictionary does not turn into a tree-like one, no idea why :|
example data:
<?xml version="1.0" encoding="UTF-8"?>
<tag>
<description>This is a text</description>
<types>
<typesAllowed></typesAllowed>
<typesEnabled></typesEnabled>
</types>
</tag>
yields the final result:
{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'typesAllowed': '\n\t\t', None: '\n', 'typesEnabled': '\n\t\t', 'description': 'This is a text'}
instead of the wanted:
{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'tag': {'description': 'this is a text', 'typesAllowed': '\n\t\t', 'typesEnabled': '\n\t\t'}}
I know these issues are most likely a result of my poor understanding of how a StreamReader works, so any and all tips would be welcome :)
edit 1:
the tokenName change was a silly positioning error, silly me. the code reflects the fix.
edit 2:
added an example and example output
This question is now solved; I took a different approach to the problem.
I basically took a list into which i appended tuples (name, {}) if the StartElement token had the attribute parseAs == "element" and put an evaluated string (parseText function) into the last tuple's dictionary. When it meets an EndElement token, it finds the tuple with name == tokenName, which is the name of the current token, puts it into the previous tuple's dictionary as an entry with key name.
There's a few more details as to how it works, but I'd probably just overly complicate my explanation if I included them (how it knows when to submit currData to self.data etc.)
class DataModel(QtCore.QAbstractItemModel):
# Reads every *.xml file in settingsDirectory and builds a nested dictionary in
# self.data; per-file XML version/encoding details go into self.parsingLog.
def __init__(self, settingsDirectory, parent = None):
super(DataModel, self).__init__(parent)
settingsDirectory.setNameFilters(["*.xml"])
files = settingsDirectory.entryList()
print(files)
self.data = {}
self.parsingLog = {}
for i in range(len(files)):
filePath = str(files[i])
file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + str(filePath))
fileOpens = file.open(file.ReadOnly | file.Text)
if fileOpens:
parser = QtCore.QXmlStreamReader(file)
# currData is used as a stack of (element name, dict) tuples for the elements
# that are currently open.
currData = []
haveStartToken = False
print(self.tr("--------Beginning parsing--------"))
print(self.tr("Reading file: "+str(filePath)))
print(self.tr("---------------------------------"))
while not parser.atEnd():
if not parser.hasError():
parser.readNext()
token = parser.tokenType()
print(self.tr("--------------------"))
print(self.tr("Token type: " + str(self.printTokenType(token))))
if token == parser.StartElement:
tokenName = parser.name()
attributes = parser.attributes()
# The parseAs attribute decides how the element is handled: "text" stores its
# converted text in the innermost open element, "element" opens a new level.
parseAs = attributes.value("parseAs")
print(self.tr("Reading StartElement: " + str(tokenName)))
print(self.tr("parseAs: " + str(parseAs)))
if parseAs == "text":
textValue = self.parseText(parser.readElementText())
print(self.tr("Text Value: " + str(textValue)))
if len(currData) != 0:
currData[len(currData)-1][1][tokenName] = textValue
else:
print(self.tr("*******Terminating application*******"))
print(self.tr("Reason: currData is empty"))
print(self.tr("*******Terminating application*******"))
# NOTE(review): sys (and time, used further down) are not imported anywhere in
# the visible listing.
sys.exit()
elif parseAs == "element":
currData.append((tokenName, {}))
else:
print(self.tr("******WARNING******"))
print(self.tr("parseAs attribute is not given correctly"))
print(self.tr("******WARNING******"))
print(self.tr("--------------------"))
elif token == parser.EndElement:
tokenName = parser.name()
print(self.tr("Reading EndElement: " + str(tokenName)))
print(self.tr("currData before: " + str(currData)))
# The first element ever pushed on currData is remembered as the root name.
if not haveStartToken:
startToken = currData[0][0]
haveStartToken = True
# NOTE(review): this loop and the range(5) loops below reuse the name `i` of the
# outer file loop.
for i in currData:
if i[0] == tokenName:
print(self.tr("Closing token: " + str(tokenName)))
if i[0] != startToken:
# Move the finished element's dict into its parent's dict and pop it.
currData[len(currData)-2][1][tokenName] = currData[len(currData)-1][1]
del currData[len(currData)-1]
print(self.tr("currData after: " + str(currData)))
print(self.tr("--------------------"))
elif i[0] == startToken:
print(self.tr("This is the final token, writing to self.data"), end = "")
self.data[startToken] = currData[0][1]
for i in range(5):
time.sleep(0.25)
print(self.tr("."), end = "")
print(self.tr("done."))
print(self.tr("--------------------"))
elif token == parser.Characters:
print(self.tr("Characters value: " + str(parser.text())))
print(self.tr("--------------------"))
elif token == parser.StartDocument:
self.parsingLog["File: "+str(filePath)] = {}
self.parsingLog["File: "+str(filePath)]["XML Version"] = str(parser.documentVersion())
self.parsingLog["File: "+str(filePath)]["XML Encoding"] = str(parser.documentEncoding())
print(self.tr("File Version: " + str(self.parsingLog["File: "+str(filePath)]["XML Version"])))
print(self.tr("File Encoding: " + str(self.parsingLog["File: "+str(filePath)]["XML Encoding"])))
elif token == parser.EndDocument:
print(self.tr("Cleaning up"), end = "")
for i in range(5):
time.sleep(0.25)
print(self.tr("."), end = "")
time.sleep(0.1)
print(self.tr("done."))
print(self.tr("self.data: " + str(self.data)))
# NOTE(review): the keys "building" / "specialSlot" are hard-coded, so this
# debug print presumably only works for the author's test file.
print(self.tr("types of data: yesNo (should be str) - " +
str(type(self.data["building"]["specialSlot"]["yesNo"])) +
" - id - should be int - " + str(type(self.data["building"]["specialSlot"]["id"])) +
" - isItFloat - should be float - " + str(type(self.data["building"]["specialSlot"]["isItFloat"]))))
print(self.tr("--------------------"))
else:
print(self.tr("XML file is not well-formatted"))
else:
print(self.tr("xml file did not open properly"))
def parseText(self, text):
    """Convert element text to an int or float when it looks numeric.

    Text made only of the characters ``0-9`` and ``.`` becomes an ``int``
    (no dot) or a ``float`` (with a dot).  Everything else, including the
    empty string, signed numbers and dot-only or multi-dot text that
    ``float`` cannot read, is returned unchanged as ``str``.

    Raises ValueError when ``text`` is not a string (the previous code
    returned the ValueError class itself instead of raising).
    """
    if not isinstance(text, str):
        raise ValueError("parseText expects a str, got " + type(text).__name__)
    if text == "" or any(ch not in "0123456789." for ch in text):
        return str(text)
    if "." not in text:
        return int(text)
    try:
        return float(text)
    except ValueError:
        # Digits-and-dots text that is still not a number, e.g. "1.2.3" or ".".
        return str(text)
def printTokenType(self, token):
    """Return the name of a QXmlStreamReader token type.

    ``token`` is compared against the QXmlStreamReader token constants in a
    fixed order (and against the literal 1 for "Invalid"); the name of the
    first match is returned, or None when nothing matches.
    """
    reader = QtCore.QXmlStreamReader
    if token == reader.NoToken:
        return "NoToken"
    if token == 1:
        return "Invalid"
    remaining = (
        "StartDocument",
        "EndDocument",
        "StartElement",
        "EndElement",
        "Characters",
        "Comment",
        "DTD",
        "EntityReference",
        "ProcessingInstruction",
    )
    for label in remaining:
        # Each returned name is identical to the constant's attribute name.
        if token == getattr(reader, label):
            return label
    return None

replace method doesn't work (python2.7)

I want to use following codes to replace strings like "/xxxxx/" with "/xxxxx.html" in the page_data, but doesn't work. page_data is bytes type which is downloaded by a crawler.
page_data.replace(each, neweach)
Only when I change them to:
page_data = page_data.replace(each, neweach)
the strings (each) in page_data are actually replaced.
The whole code is below:
import os
import sys
import re
import urllib
import urllib2
class WebGet(object):
# Small crawler: starting from base_url it downloads pages, rewrites reference
# links via pageHandle() and stores each page as an .html file.
# NOTE(review): the three containers below are class attributes that are mutated
# through self, so every WebGet instance shares them.
base_url = ""
urls_list = []
history_list = []
replace_ch={}
def __init__(self, base_url):
# The last character of base_url is dropped; URL suffixes in urls_list start with '/'.
self.base_url = base_url[:-1]
self.urls_list.append('/')
self.replace_ch[">>"] = "%3E%3E"
self.replace_ch["<<"] = "%3C%3C"
self.replace_ch["::"] = "%3A%3A"
def recurseGet(self):
'''Get page data recursively'''
# urls_list is used as a work queue; pageHandle() appends newly found links to it.
while(len(self.urls_list) != 0):
url_suffix = self.urls_list[0]
self.urls_list.remove(url_suffix)
self.history_list.append(url_suffix)
url_to_get = self.base_url + url_suffix
"Get page data with url"
print "To get",url_to_get
page_data = urllib2.urlopen(url_to_get).read()
page_data_done = self.pageHandle(page_data)
"Write the page data into file"
if url_suffix[-1] == '/':
url_suffix = url_suffix[:-1]
if url_suffix == '':
url_suffix = "index"
elif url_suffix[0] == '/':
url_suffix = url_suffix[1:]
# NOTE(review): str.replace returns a new string; the results of the next four
# calls are discarded, so url_suffix is left unchanged.
url_suffix.replace('/','\\')
url_suffix.replace('>>','%3E%3E')
url_suffix.replace('<<','%3C%3C')
url_suffix.replace('::','%3A%3A')
file_str = "e:\\reference\\"+url_suffix
if file_str.rfind("\\") != 12:
new_dir = file_str[:file_str.rfind("\\")]
# NOTE(review): new_dir is computed but the check and mkdir below use file_str -
# TODO confirm which path was meant.
if os.path.isdir(file_str) == False:
os.mkdir(file_str)
file_str = file_str.strip()+".html"
print "write file",file_str
f_page = open(file_str, "wb")
f_page.write(page_data_done)
# NOTE(review): without parentheses the close method is referenced, not called.
f_page.close
def pageHandle(self, page_data):
# Finds reference links in page_data, queues the ones not seen yet and returns
# page_data.
# NOTE(review): the result of this replace is discarded as well.
page_data.replace("http://www.cplusplus.com/","/") #here the replace works
re_rule = '<a href="/reference(/\S{2,40}/)\">'
list_page_urls = re.findall(re_rule, page_data)
for each in list_page_urls:
neweach = each
neweach = neweach[:-1]+".html"
#page_data = page_data.replace(each, neweach)
# NOTE(review): result discarded; the commented-out line above is the form that
# keeps the replaced text.
page_data.replace(each, neweach)
if each in page_data:
print "fail replace"
if each in self.history_list:
continue
elif each in self.urls_list:
continue
elif each == '/':
continue
self.urls_list.append(each)
return page_data
def main():
    """Crawl the cplusplus.com reference section starting from its index page."""
    crawler = WebGet("http://www.cplusplus.com/reference/")
    crawler.recurseGet()


if __name__ == "__main__":
    main()
Why could this be?
Because that's what the replace method does: returns a copy of the string with the relevant characters replaced.
Apart from anything else, strings are immutable in Python, so it couldn't work any other way.

Categories