I'm trying to write code that solves a facility location problem. I have created a data structure in the variable data. data is a list containing 4 lists. data[0] is a list of city names with a length of 128. The other three are irrelevant for now. There is also a function called nearbyCities(cityname, radius, data), which takes a city name, a radius, and the data, and outputs a list of cities within the radius. Assuming that all the code mentioned is correct, why is the error:
File "/Applications/Wing101.app/Contents/MacOS/src/debug/tserver/_sandbox.py", line 232, in locateFacilities
File "/Applications/Wing101.app/Contents/MacOS/src/debug/tserver/_sandbox.py", line 162, in served
File "/Applications/Wing101.app/Contents/MacOS/src/debug/tserver/_sandbox.py", line 131, in nearbyCities
AttributeError: 'bool' object has no attribute 'index'
popping up?
Here are the three functions in question. r is the radius within which I am trying to serve cities. The first two are just helpers for the third, which is the one I am trying to call. I think the error is in the while loop.
def served(city, r, data, FalseList): #Helper Function 1
    nearbycity=nearbyCities(city, r, data)
    for everycity in nearbycity:
        dex1=data[0].index(everycity)
        FalseList[dex1]=True
    return FalseList

def CountHowManyCitiesAreInRThatAreNotServed(city, FalseList, r, data): #Helper Function 2
    NBC= nearbyCities(city, r, data)
    notserved=0
    for element in NBC:
        if FalseList[data[0].index(element)] == False:
            notserved= notserved+1
    return notserved

def locateFacilities(data, r):
    FalseList=[False]*128
    Cities=data[0]
    Radius=[]
    output=[]
    for everycity in Cities:
        Radius.append(len(nearbyCities(everycity, r, data)))
    maxito= max(Radius) #Take Radius and find the city that has the most cities in r radius from it.
    dex= Radius.index(maxito)
    firstserver=Cities[dex]
    output.append(firstserver)
    FalseList=served(firstserver, r, data, FalseList)
    while FalseList.count(False) > 0:
        WorkingCityList=[]
        Radius2=[]
        temp=[]
        for everycity in Cities:
            if FalseList[Cities.index(everycity)] == False:
                Radius2.append(CountHowManyCitiesAreInRThatAreNotServed(everycity, FalseList, r, data))
                temp.append(everycity)
        maxito=max(Radius2)
        dex = Radius2.index(maxito)
        serverC= temp[dex]
        output.append(serverC)
        FalseList=served(serverC, r, FalseList, data)
    output.sort()
    return output
This is how the rest of the code starts
import re #Import Regular Expressions

def createDataStructure():
    f=open('miles.dat') #Opens the file
    CITY_REG = re.compile(r"([^[]+)\[(\d+),(\d+)\](\d+)") #Regular expression whose pattern captures 4 groups: the city name (everything before the opening bracket), the two integers inside the brackets separated by a comma, and the integer after the closing bracket.
    CITY_TYPES = (str, int, int, int) #Conversion functions used to change each captured string into the desired type.
    #Initialized lists
    Cities=[]
    Coordinates=[]
    Populations=[]
    TempD=[]
    FileDistances=[]
    #Loop that reads the file line by line
    line=f.readline()
    while line:
        match = CITY_REG.match(line) #Matches the line against the compiled pattern. Returns None if not matched.
        if match:
            temp= [type(dat) for dat,type in zip(match.groups(), CITY_TYPES)] #Converts each matched string into the desired type.
            #Moves the matched values into individual lists
            Cities.append(temp[0])
            Coordinates.append([temp[1],temp[2]])
            Populations.append(temp[3])
            if TempD: #Once the distance line(s) are over and a city line is matched, this appends the distances to a distance list.
                FileDistances.append(TempD)
                TempD=[]
        elif not(line.startswith('*')): #Runs if the line isn't commented out with a "*" and isn't a city line.
            g=line.split() #This chunk takes a string of numbers and converts it into a list of integers.
            i=0
            intline=[]
            while i != len(g):
                intline.append(int(g[i]))
                i+=1
            TempD.extend(intline)
        line=f.readline()
    f.close() #End parsing file
    FileDistances.append(TempD) #Appends the last distance list
    FileDistances.insert(0,[]) #Empty list for the first city, which has no preceding distances
    i=0
    j=1
    while i != 128: #Loop takes the lists of distances and makes each one length 128 with the corresponding distances
        FileDistances[i].reverse() #Reverses the current distance list so it corresponds to the distances from the cities listed before.
        FileDistances[i].append(0) #Appends 0 because at this point the distance is from the city to itself.
        counter=i+1
        while len(FileDistances[i]) != 128: #Loop that appends the other distances.
            FileDistances[i].append(FileDistances[counter][-j])
            counter=counter+1
        j+=1
        i+=1
    cities=[]
    for i in Cities: #Removes the commas. I don't know why we need to get rid of the commas...
        new=i.replace(',','')
        cities.append(new)
    #Final product <3
    MasterList=[cities, Coordinates, Populations, FileDistances]
    return MasterList
getCoordinates
def getCoordinates(cityname, data): #Basic search function
    INDEX=data[0].index(cityname)
    return data[1][INDEX]
getPopulation
def getPopulation (cityname, data): #Basic search function
    INDEX=data[0].index(cityname)
    return data[2][INDEX]
getDistance
def getDistance (cityname1, cityname2, data): #Basic search function
    INDEX=data[0].index(cityname1)
    INDEX2=data[0].index(cityname2)
    return data[3][INDEX][INDEX2]
nearbyCities
def nearbyCities(cityname, radius, data):
    Cities=data[0]
    INDEX=Cities.index(cityname)
    workinglist=data[3][INDEX] #data[3] is the distance list
    IndexList=[]
    index = 0
    while index < len(workinglist): #Goes through the list and collects the indexes of cities within the radius
        if workinglist[index] <= radius:
            IndexList.append(index)
        index += 1
    output=[]
    for i in IndexList: #Looks up each index and appends the city name to an output list
        output.append(Cities[i])
    output.sort()
    return output
The file miles.dat can be found at http://mirror.unl.edu/ctan/support/graphbase/miles.dat
Well, it appears that data[0] contains a boolean, not a string. I tried this in an empty interpreter and was able to raise the same exception by calling .index on a bool.
The data list's format itself looks fine; the problem is the last call to served() inside your while loop. Its definition is served(city, r, data, FalseList), but you call served(serverC, r, FalseList, data), with the last two arguments swapped. nearbyCities() therefore receives FalseList as its data parameter, so data[0] inside it is the boolean False rather than the list of city names, which is exactly what the traceback reports.
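A minimal sketch of the fix, keeping the signature from the question (only the argument order of the second call changes):
def served(city, r, data, FalseList):  #signature from the question
    ...

FalseList = served(firstserver, r, data, FalseList)  #first call: correct order
#inside the while loop, match the same order:
FalseList = served(serverC, r, data, FalseList)      #was: served(serverC, r, FalseList, data)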
I need to create a function that reads the given data and builds a list of tuples, each of which has the airport name as its first element and its geographical coordinates, as float numbers, as its second and third elements.
airport_data = """
Alexandroupoli 40.855869°N 25.956264°E
Athens 37.936389°N 23.947222°E
Chania 35.531667°N 24.149722°E
Chios 38.343056°N 26.140556°E
Corfu 39.601944°N 19.911667°E
Heraklion 35.339722°N 25.180278°E"""
airports = []
import re
airport_data1 = re.sub("[°N#°E]","",airport_data)
def process_airports(string):
    airports_temp = list(string.split())
    airports = [tuple(airports_temp[x:x+3]) for x in range(0, len(airports_temp), 3)]
    return airports
print(process_airports(airport_data1))
This is my code so far but I'm new to Python, so I'm struggling to debug my code.
If you want the second and third elements of the tuple to be floats, you have to convert them using the float() function.
One way to do this is to create the tuple with round brackets in your list comprehension and convert the values there:
def process_airports(string):
    airports_temp = string.split()
    airports = [(airports_temp[x], float(airports_temp[x+1]), float(airports_temp[x+2])) for x in range(0, len(airports_temp), 3)]
    return airports
This yields a pretty unwieldy expression, so maybe this problem could be solved more readably with a classical for loop.
Also note that split() already returns a list, so wrapping it in list() is unnecessary.
Further remark: if you just cut off the letters from the coordinates, this might come back to bite you when your airports are in different quadrants.
You need to take into account N/S and W/E for latitude and longitude.
Maybe:
def process_airports(string):
    airports = []
    for line in string.split('\n'):
        if not line: continue
        name, lon, lat = line.split()
        airports.append((name,
                         float(lon[:-2]) * (1 if lon[-1] == "N" else -1),
                         float(lat[:-2]) * (-1 if lat[-1] == "E" else 1)
                        ))
    return airports
>>> process_airports(airport_data)
[('Alexandroupoli', 40.855869, -25.956264), ('Athens', 37.936389, -23.947222), ('Chania', 35.531667, -24.149722), ('Chios', 38.343056, -26.140556), ('Corfu', 39.601944, -19.911667), ('Heraklion', 35.339722, -25.180278)]
I preferred the double split to make the distinction between lines and tuple elements explicit.
I want to calculate the average vector length from a file that contains coordinates. Ultimately I want to store each vector_length in a list called pair_length. I will calculate the average of the pair_length list later on in my program using the average() function. Here is a snippet of my code:
from numpy import sqrt
from itertools import islice
from statistics import mean
data = open("coords.txt","r")
def average():
    return mean()

pair_length = []
for line in islice(data, 1, None): #the first line is the number of pairs
    fields = line.split(" ")
    pair_num = int(fields[0]) #the first field is the pair number
    x_cord = float(fields[1]) #x-coordinate
    y_cord = float(fields[2]) #y-coordinate
    vector_length = sqrt(x_cord**2 + y_cord**2) #vector length (all numbers in the coords.txt file are real and positive)
    vector_length.append(pair_length)
I receive the error:
AttributeError: 'numpy.float64' object has no attribute 'append'
Here vector_length stores a float value, so the append operation won't work on it.
In Python, append works on lists.
So, what we can do is:
Instead of
vector_length.append(pair_length)
We can do as follows:
pair_length.append(vector_length)
Hope this works.
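Putting that together, a minimal sketch of the corrected loop might look like this (assuming the same coords.txt layout described in the question):
from numpy import sqrt
from itertools import islice

pair_length = []
with open("coords.txt", "r") as data:
    for line in islice(data, 1, None):  #skip the first line (the number of pairs)
        fields = line.split()
        x_cord = float(fields[1])       #x-coordinate
        y_cord = float(fields[2])       #y-coordinate
        vector_length = sqrt(x_cord**2 + y_cord**2)
        pair_length.append(vector_length)  #append the float to the list, not the other way round

average_length = sum(pair_length) / len(pair_length)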
I am trying to use Python to parse a text file (whose path is stored in the variable trackList) with times and titles in it. It looks like this:
00:04:45 example text
00:08:53 more example text
12:59:59 the last bit of example text
My regular expression (rem) works, and I am also able to split the string (i) into two parts correctly (separating the times from the text), but I am unable to add the arrays that the split returns (using .extend) to the larger array I created earlier (sLines).
f=open(trackList)
count=0
sLines=[[0 for x in range(0)] for y in range(34)]
line=[]
for i in f:
    count+=1
    line.append(i)
    rem=re.match("\A\d\d\:\d\d\:\d\d\W",line[count-1])
    if rem:
        sLines[count-1].extend(line[count-1].split(' ',1))
    else:
        print("error on line: "+count)
That code should go through each line in the file trackList and test whether the line is as expected; if so, it separates the time from the text and saves the result as an array inside an array, at the index one less than the current line number; if not, it prints an error pointing me to the line.
I use array[count-1] as python arrays are zero indexed and file lines are not.
I use .extend() as I want both elements of the smaller array added to the larger array in the same iteration of the parent for loop.
So, you have some pretty confusing code there.
For instance doing:
[0 for x in range(0)]
Is a really fancy way of initializing an empty list:
>>> [] == [0 for x in range(0)]
True
Also, how do you know the matrix should be 34 lines long? You're also confusing yourself by calling your line 'i' in your for loop; usually that name is reserved as shorthand for an index, which you'd expect to be a numerical value. Appending i to line and then re-referencing it as line[count-1] is redundant when you already have your line variable (i).
Your overall code can be simplified to something like this:
# load the file and extract the lines
f = open(trackList)
lines = f.readlines()
f.close()

# create the expression (more optimized for loops)
expr = re.compile('^(\d\d:\d\d:\d\d)\s*(.*)$')

sLines = []
# loop the lines collecting both the index (i) and the line (line)
for i, line in enumerate(lines):
    result = expr.match(line)
    # validate the line
    if ( not result ):
        print("error on line: " + str(i+1))
        # add an invalid list to the matrix
        sLines.append([]) # or whatever you want as your invalid line
        continue
    # add the list to the matrix
    sLines.append(result.groups())
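For the three sample lines from the question, sLines should end up roughly as:
[('00:04:45', 'example text'), ('00:08:53', 'more example text'), ('12:59:59', 'the last bit of example text')]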
I have a file whose contents are of the form:
.2323 1
.2327 1
.3432 1
.4543 1
and so on some 10,000 lines in each file.
I have a variable whose value is say a=.3344
From the file I want to get the row number of the row whose first column is closest to this variable...for example it should give row_num='3' as .3432 is closest to it.
I have tried loading the first column's elements into a list and then comparing the variable to each element to get the index number.
Doing it that way is very time consuming and slows my model... I want a very quick method, as this needs to be called some 1000 times minimum...
I want a method with the least overhead that is very quick; can anyone please tell me how it can be done very fast?
As the file size is a maximum of 100 KB, can this be done directly without loading it into any list or anything... if yes, how can it be done?
Any method quicker than the one mentioned above is welcome, but I am desperate to improve the speed -- please help.
def get_list(file, cmp, fout):
    ind, _ = min(enumerate(file), key=lambda x: abs(x[1] - cmp))
    return fout[ind].rstrip('\n').split(' ')

#root = r'c:\begpython\wavnk'
header = 6
for lst in lists:
    save = database_index[lst]
    #print save
    index, base, abs2, _, abs1 = save
    using_data[index] = save
    base = 'C:/begpython/wavnk/' + base.replace('phone', 'text')
    fin, fout = base + '.pm', base + '.mcep'
    file = open(fin)
    fout = open(fout).readlines()
    [next(file) for _ in range(header)]
    file = [float(line.partition(' ')[0]) for line in file]
    join_cost_index_end[index] = get_list(file, float(abs1), fout)
    join_cost_index_strt[index] = get_list(file, float(abs2), fout)
This is the code I was using... copying the file into a list. Please give me better alternatives to this.
Building on John Kugelman's answer, here's a way you might be able to do a binary search on a file with fixed-length lines:
class SubscriptableFile(object):
    def __init__(self, file):
        self._file = file
        file.seek(0,0)
        self._line_length = len(file.readline())
        file.seek(0,2)
        self._len = file.tell() / self._line_length

    def __len__(self):
        return self._len

    def __getitem__(self, key):
        self._file.seek(key * self._line_length)
        s = self._file.readline()
        if s:
            return float(s.split()[0])
        else:
            raise KeyError('Line number too large')
This class wraps a file in a list-like structure, so that now you can use the functions of the bisect module on it:
def find_row(file, target):
    fw = SubscriptableFile(file)
    i = bisect.bisect_left(fw, target)
    if fw[i + 1] - target < target - fw[i]:
        return i + 1
    else:
        return i
Here file is an open file object and target is the number you want to find. The function returns the number of the line with the closest value.
I will note, however, that the bisect module will try to use a C implementation of its binary search when it is available, and I'm not sure if the C implementation supports this kind of behavior. It might require a true list, rather than a "fake list" (like my SubscriptableFile).
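For illustration, assuming the SubscriptableFile class and find_row function above (and an import bisect at the top), a hypothetical call might look like this; the filename is just an example:
import bisect

with open('numbers.txt') as f:   #sorted file with fixed-length lines
    print(find_row(f, 0.3344))   #index of the line whose first column is closest to 0.3344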
Is the data in the file sorted in numerical order? Are all the lines of the same length? If not, the simplest approach is best. Namely, reading through the file line by line. There's no need to store more than one line in memory at a time.
Code:
def closest(num):
    closest_row = None
    closest_value = None
    for row_num, row in enumerate(file('numbers.txt')):
        value = float(row.split()[0])
        if closest_value is None or abs(value - num) < abs(closest_value - num):
            closest_row = row
            closest_row_num = row_num
            closest_value = value
    return (closest_row_num, closest_row)

print closest(.3344)
Output for sample data:
(2, '.3432 1\n')
If the lines are all the same length and the data is sorted then there are some optimizations that will make this a very fast process. All the lines being the same length would let you seek directly to particular lines (you can't do this in a normal text file with lines of different length). Which would then enable you to do a binary search.
A binary search would be massively faster than a linear search. A linear search will on average have to read 5,000 lines of a 10,000 line file each time, whereas a binary search would on average only read log2 10,000 ≈ 13 lines.
Load it into a list then use bisect.
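A rough sketch of what that could look like, assuming the first column is sorted as in the sample data (the filename is just an example):
import bisect

#read the first column into a sorted list
with open('numbers.txt') as f:
    values = [float(line.split()[0]) for line in f]

def closest_row(target):
    i = bisect.bisect_left(values, target)
    #bisect gives the insertion point; compare the neighbours to find the closest value
    candidates = [j for j in (i - 1, i) if 0 <= j < len(values)]
    return min(candidates, key=lambda j: abs(values[j] - target))

print(closest_row(.3344))   #-> 2 for the sample data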
I have a 384MB text file with 50 million lines. Each line contains 2 space-separated integers: a key and a value. The file is sorted by key. I need an efficient way of looking up the values of a list of about 200 keys in Python.
My current approach is included below. It takes 30 seconds. There must be more efficient Python foo to get this down to a reasonable efficiency of a couple of seconds at most.
# list contains a sorted list of the keys we need to lookup
# there is a sentinel at the end of list to simplify the code
# we use pointer to iterate through the list of keys
for line in fin:
    line = map(int, line.split())
    while line[0] == list[pointer].key:
        list[pointer].value = line[1]
        pointer += 1
    while line[0] > list[pointer].key:
        pointer += 1
        if pointer >= len(list) - 1:
            break # end of list; -1 is due to sentinel
Coded binary search + seek solution (thanks kigurai!):
entries = 24935502 # number of entries
width = 18         # fixed width of an entry in the file padded with spaces
                   # at the end of each line
for i, search in enumerate(list): # list contains the list of search keys
    left, right = 0, entries-1
    key = None
    while key != search and left <= right:
        mid = (left + right) / 2
        fin.seek(mid * width)
        key, value = map(int, fin.readline().split())
        if search > key:
            left = mid + 1
        else:
            right = mid - 1
    if key != search:
        value = None # for when search key is not found
    search.result = value # store the result of the search
If you only need 200 of 50 million lines, then reading all of it into memory is a waste. I would sort the list of search keys and then apply binary search to the file using seek() or something similar. This way you would not read the entire file into memory, which I think should speed things up.
Slight optimization of S.Lott's answer:
from collections import defaultdict
keyValues= defaultdict(list)
targetKeys= # some list of keys as strings
for line in fin:
    key, value = line.split()
    if key in targetKeys:
        keyValues[key].append( value )
Since we're using a dictionary rather than a list, the keys don't have to be numbers. This saves the map() operation and a string-to-integer conversion for each line. If you want the keys to be numbers, do the conversion at the end, when you only have to do it once for each key, rather than for each of 50 million lines.
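For instance, the final conversion could be a single pass over the collected dictionary (a sketch; the names follow the snippet above):
#convert the string keys (and values) to integers once, after the whole file has been scanned
keyValues = dict((int(k), [int(v) for v in vs]) for k, vs in keyValues.items())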
It's not clear what "list[pointer]" is all about. Consider this, however.
from collections import defaultdict
keyValues= defaultdict(list)
targetKeys= # some list of keys
for line in fin:
    key, value = map( int, line.split())
    if key in targetKeys:
        keyValues[key].append( value )
I would use memory-mapping: http://docs.python.org/library/mmap.html.
This way you can use the file as if it's stored in memory, but the OS decides which pages should actually be read from the file.
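A rough sketch of that idea, combined with the fixed-width binary search discussed elsewhere in this thread (the record width and file layout are assumptions, not part of the original question):
import mmap

def lookup(path, search_key, width):
    """Binary-search a sorted, fixed-width 'key value' file through a memory map."""
    with open(path, 'rb') as f:
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            left, right = 0, len(mm) // width - 1
            while left <= right:
                mid = (left + right) // 2
                key, value = map(int, mm[mid * width:(mid + 1) * width].split())
                if key == search_key:
                    return value
                elif key < search_key:
                    left = mid + 1
                else:
                    right = mid - 1
            return None   #key not present
        finally:
            mm.close()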
Here is a recursive binary search on the text file
import os, stat

class IntegerKeyTextFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.f = open(self.filename, 'r')
        self.getStatinfo()

    def getStatinfo(self):
        self.statinfo = os.stat(self.filename)
        self.size = self.statinfo[stat.ST_SIZE]

    def parse(self, line):
        key, value = line.split()
        k = int(key)
        v = int(value)
        return (k,v)

    def __getitem__(self, key):
        return self.findKey(key)

    def findKey(self, keyToFind, startpoint=0, endpoint=None):
        "Recursively search a text file"
        if endpoint is None:
            endpoint = self.size
        currentpoint = (startpoint + endpoint) // 2
        while True:
            self.f.seek(currentpoint)
            if currentpoint != 0:
                # may not start at a line break! Discard.
                baddata = self.f.readline()
            linestart = self.f.tell()
            keyatpoint = self.f.readline()
            if not keyatpoint:
                # read returned empty - end of file
                raise KeyError('key %d not found'%(keyToFind,))
            k,v = self.parse(keyatpoint)
            if k == keyToFind:
                print 'key found at ', linestart, ' with value ', v
                return v
            if endpoint == startpoint:
                raise KeyError('key %d not found'%(keyToFind,))
            if k > keyToFind:
                return self.findKey(keyToFind, startpoint, currentpoint)
            else:
                return self.findKey(keyToFind, currentpoint, endpoint)
A sample text file created in jEdit seems to work:
>>> i = integertext.IntegerKeyTextFile('c:\\sampledata.txt')
>>> i[1]
key found at 0 with value 345
345
It could definitely be improved by caching found keys and using the cache to determine future starting seek points.
If you have any control over the format of the file, the "sort and binary search" responses are correct. The detail is that this only works with records of a fixed size and offset (well, I should say it only works easily with fixed length records).
With fixed length records, you can easily seek() around the sorted file to find your keys.
One possible optimization is to do a bit of buffering using the sizehint option in file.readlines(..). This allows you to load multiple lines in memory, totaling approximately sizehint bytes.
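A rough sketch of that buffering pattern (process() is a hypothetical placeholder, and the sizehint value is just an example):
with open('data.txt') as f:
    while True:
        chunk = f.readlines(65536)   #read roughly 64 KB worth of complete lines
        if not chunk:
            break
        for line in chunk:
            process(line)            #placeholder for the per-line work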
You need to implement binary search using seek()