How to combine three columns into one in python - python

Excel table = this is the excel file screenshot which is how final result should be. Please take closer look at "Lifestyle" section.
I can't figure out how to make my python just like the excel picture screenshot. "Lifestyle" section needs to have 2 more sub-columns combined just like in a picture below. Any help would be appreciated.
I'm gonna post picture below PyCharm screenshot:
Here is my code:
# Convert a height in whole inches to a feet-and-inches string, e.g. 63 -> 5'3".
def inch_to_feet(x):
    """Return *x* inches formatted as FEET'INCHES" (e.g. 63 -> 5'3")."""
    feet, inch = divmod(x, 12)
    return f"{feet}'{inch}\""
#file opened
# NOTE(review): this is the question's original script (indentation was lost
# in the paste). Several defects are flagged inline rather than fixed.
print("Hello")
roster = input("Please enter a roster file: ")
if roster != "roster_extended.csv":
print("Invalid name")
elif roster == "roster_extended.csv":
additional_name = input("There are 13 lines in this file. Would you like to enter an additional names? (Y/N): ")
if additional_name == "Y":
# BUG: the answer to "How many more names?" is read but never assigned.
input("How many more names?: ")
infile = open("roster_extended.csv", 'r')
# First readline() skips the header row; readlines() then collects the rest.
b = infile.readline()
b = infile.readlines()
header = '{0:>12} {1:>35} {2:>3} {3:>16} {4:>5} {5:>3} {6:>9}'.format("FirstName","LastName","Age","Occupation","Ht","Wt","lifestyle")
print(header)
# BUG: rebinding `b` to write()'s return value (an int) clobbers the list of
# rows read above, so the for-loop below no longer iterates over file rows.
with open("roster_extended.csv", "a+") as infile:
b = infile.write(input("Enter first name: "))
for person in b:
newperson = person.replace("\n", "").split(",")
# NOTE(review): eval() on file data is unsafe — int() is the right tool here.
newperson[4] = eval(newperson[4])
newperson[4] = inch_to_feet(newperson[4])
# This bare expression has no effect.
newperson
formatted='{0:>12} {1:>35} {2:>3} {3:>16} {4:>5} {5:>3} {6:>9}'.format(newperson[0],newperson[1],newperson[2],newperson[3],newperson[4],newperson[5],newperson[6])
print(formatted)
Here is the output I get:
FirstName LastName Age Occupation Ht Wt lifestyle
Anna Barbara 35 nurse 5'3" 129
Catherine Do 45 physicist 5'5" 135
Eric Frederick 28 teacher 5'5" 140
Gabriel Hernandez 55 surgeon 5'7" 150 x
Ivy Joo 31 engineer 5'2" 126 x
Kelly Marks 21 student 5'4" 132
Nancy Owens 60 immunologist 5'8" 170 x
Patricia Qin 36 dental assistant 4'11" 110 x
Roderick Stevenson 51 bus driver 5'6" 160 x
Tracy Umfreville 42 audiologist 5'7" 156 x
Victoria Wolfeschlegelsteinhausenbergerdorff 38 data analyst 5'8" 158
Lucy Xi 49 professor 5'9" 161
Yolanda Zachary 58 secretary 5'10" 164 x

Brief explanation of the solution:
You gave tabulated data as input (there are several ways to tabulate: check here). Since you're starting with python the solution keeps within standard library (thus not resorting to external libraries). Only format() and class variables are used to keep track of column width (if you delete elements you'll want to update the variables.) This programmatically automates tabulation.
Since you are starting out, I recommend putting a breakpoint in __init__() and __new__() to observe their behavior.
I used Enum because conceptually it's the right tool for the job. You only need to understand Enum.name and Enum.value, as for everything else consider it a normal class.
There are 2 output files, one in tabulated form and the other in barebone csv.
(For the most part the solution is "canonical" (or close). The procedural part was rushed, but gives a sufficient idea.)
import codecs
import csv
from enum import Enum
from pathlib import Path
from typing import Iterator
# Input roster plus the two report outputs (plain CSV and tabulated).
# NOTE(review): Windows-specific absolute paths — adjust for your machine.
IN_FILE = Path("C:\\your_path\\input.csv")
OUT_FILE = Path("C:\\your_path\\output1.csv")
OUT_FILE_TABULATE = Path("C:\\your_path\\output2.csv")
def read_csv(file) -> Iterator[list]:
    """Yield each row of the CSV *file* as a list of column strings.

    This is a generator (the original ``-> list`` annotation was wrong):
    rows are produced lazily and the file is closed when it is exhausted.
    """
    # newline='' is the csv-module-documented way to open CSV files; it keeps
    # quoted fields containing newlines intact.
    with open(file, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            yield row
def write_file(file, result_ordered):
    """Write each string in *result_ordered* to *file*, one per line, UTF-8.

    The built-in ``open(..., encoding=...)`` supersedes the legacy
    ``codecs.open``; mode ``"w"`` suffices since nothing is read back.
    """
    with open(file, "w", encoding="utf-8") as file_out:
        # Single writelines call instead of many tiny write calls.
        file_out.writelines(s + '\n' for s in result_ordered)
class LifeStyle(Enum):
    """The three lifestyle categories, in the column order of the report.

    Note: the pasted original had ``#staticmethod`` comments where the
    ``@staticmethod`` decorators belong (the ``@`` was mangled in transit);
    they are restored here.
    """
    Sedentary = 1
    Active = 2
    Moderate = 3

    def to_list(self):
        """Return one cell per lifestyle: 'x' for this member, '' otherwise."""
        return ['x' if one_style is self else '' for one_style in LifeStyle]

    def tabulate(self):
        """Like to_list(), but each cell is centre-padded to its header width."""
        str_list_life_style = []
        for one_style in LifeStyle:
            mark = 'x' if one_style is self else ' '
            str_list_life_style.append('{: ^{width}}'.format(mark, width=len(one_style.name)))
        return str_list_life_style

    def tabulate_single_column(self):
        """Return this member's name right-aligned to the widest name's column."""
        return '{: >{width}}'.format(str(self.name), width=len(LifeStyle.Sedentary.name))

    @staticmethod
    def header_single_column():
        """Header for the single combined lifestyle column."""
        return ' {}'.format(LifeStyle.__name__)

    @staticmethod
    def header():
        """Header for the three separate lifestyle columns."""
        return ' {} {} {}'.format(
            LifeStyle.Sedentary.name,
            LifeStyle.Active.name,
            LifeStyle.Moderate.name,
        )
class Person:
    """One roster row; the class also tracks the widest value seen per column.

    Note: the pasted original had ``#classmethod`` comments where the
    ``@classmethod`` decorators belong — without them ``Person._tabulate_``
    is called with six arguments against seven parameters and raises a
    TypeError. The decorators are restored here.
    """

    _FIRST_NAME = "First Name"
    _LAST_NAME = "Last Name"
    _AGE = "Age"
    _OCCUPATION = "Occupation"
    _HEIGHT = "Height"
    _WEIGHT = "Weight"
    # Widths of the variable-length columns, grown as instances are created.
    max_len_first_name = len(_FIRST_NAME)
    max_len_last_name = len(_LAST_NAME)
    max_len_occupation = len(_OCCUPATION)

    def __new__(cls, first_name, last_name, age, occupation, height, weight, lifestyle):
        # Widths are updated at construction time so headers and rows printed
        # afterwards line up with the longest value seen so far.
        cls.max_len_first_name = max(cls.max_len_first_name, len(first_name))
        cls.max_len_last_name = max(cls.max_len_last_name, len(last_name))
        cls.max_len_occupation = max(cls.max_len_occupation, len(occupation))
        return super().__new__(cls)

    def __init__(self, first_name, last_name, age, occupation, height, weight, lifestyle):
        self.first_name = first_name
        self.last_name = last_name
        self.age = age
        self.occupation = occupation
        self.height = height
        self.weight = weight
        self.lifestyle = lifestyle  # a LifeStyle member (or '' when unknown)

    @classmethod
    def _tabulate_(cls, first_name, last_name, age, occupation, height, weight):
        """Format the six fixed columns using the current class-level widths."""
        return '{: >{m_first}} {: >{m_last}} {: >{m_age}} {: <{m_occup}} {: <{m_height}} {: >{m_weight}}'.format(
            first_name,
            last_name,
            age,
            occupation,
            height,
            weight,
            m_first=cls.max_len_first_name,
            m_last=cls.max_len_last_name,
            m_occup=cls.max_len_occupation,
            m_age=len(cls._AGE),
            m_height=len(cls._HEIGHT),
            m_weight=len(cls._WEIGHT))

    @classmethod
    def header(cls, header_life_style):
        """Return the full header row; *header_life_style* supplies the lifestyle part."""
        first_part = cls._tabulate_(cls._FIRST_NAME, cls._LAST_NAME, cls._AGE,
                                    cls._OCCUPATION, cls._HEIGHT, cls._WEIGHT)
        return '{}{}'.format(first_part, header_life_style)

    def __str__(self):
        # Row with three separate lifestyle columns ('x' in the matching one).
        first_part = Person._tabulate_(self.first_name, self.last_name, self.age,
                                       self.occupation, self.height, self.weight)
        return '{}{}'.format(first_part, ' '.join(self.lifestyle.tabulate()))

    def single_column(self):
        """Row with the lifestyle shown as a single named column."""
        first_part = Person._tabulate_(self.first_name, self.last_name, self.age,
                                       self.occupation, self.height, self.weight)
        return '{} {}'.format(first_part, self.lifestyle.tabulate_single_column())
# Columns 6-8 of the input CSV hold the 'x' marker for
# Sedentary / Moderate / Active respectively.
def populate(persons_populate):
    """Append one Person per row of IN_FILE and return the (mutated) list."""
    for row in read_csv(IN_FILE):
        if row[6] == 'x':
            style = LifeStyle.Sedentary
        elif row[7] == 'x':
            style = LifeStyle.Moderate
        elif row[8] == 'x':
            style = LifeStyle.Active
        else:
            # No marker at all: keep the original sentinel value.
            style = ''
        persons_populate.append(Person(row[0], row[1], row[2], row[3], row[4], row[5], style))
    return persons_populate
# --- driver: load the roster, then emit the three report variants ---
persons = populate([])

# 1) tabulated report, three separate lifestyle columns
print(Person.header(LifeStyle.header()))
for one in persons:
    print(one)
write_file(OUT_FILE_TABULATE, [str(one) for one in persons])

# add new persons here
persons.append(Person("teste", "teste", "22", "worker", "5'8\"", "110", LifeStyle.Active))

# 2) barebone csv — padded lifestyle cells stripped back to 'x' / ''
final_list = []
for one in persons:
    fields = [one.first_name, one.last_name, one.age, one.occupation, one.height,
              one.weight]
    fields.extend(cell.strip() for cell in one.lifestyle.tabulate())
    final_list.append(','.join(fields))
write_file(OUT_FILE, final_list)

# 3) single combined lifestyle column
print("\n", Person.header(LifeStyle.header_single_column()))
for one in persons:
    print(one.single_column())
output1.csv:
Anna,Barbara,35,nurse,5'3",129,,,x
Catherine,Do,45,physicist,5'5",135,,x,
Eric,Frederick,28,teacher,5'5",140,,,x
Gabriel,Hernandez,55,surgeon,5'7",150,x,,
Ivy,Joo,31,engineer,5'2",126,x,,
Kelly,Marks,21,student,5'4",132,,x,
Nancy,Owens,60,immunologist,5'8",170,x,,
Patricia,Qin,36,dental assistant,4'11",110,x,,
Roderick,Stevenson,51,bus driver,5'6",160,x,,
Tracy,Umfreville,42,audiologist,5'7",156,x,,
Victoria,Wolfeschlegelsteinhausenbergerdorff,38,data analyst ,5'8",158,,,x
Lucy,Xi,49,professor,5'9",161,,,x
Yolanda,Zachary,58,secretary,5'10",164,x,,
teste,teste,22,worker,5'8",110,,x,
output2.csv:
Anna Barbara 35 nurse 5'3" 129 x
Catherine Do 45 physicist 5'5" 135 x
Eric Frederick 28 teacher 5'5" 140 x
Gabriel Hernandez 55 surgeon 5'7" 150 x
Ivy Joo 31 engineer 5'2" 126 x
Kelly Marks 21 student 5'4" 132 x
Nancy Owens 60 immunologist 5'8" 170 x
Patricia Qin 36 dental assistant 4'11" 110 x
Roderick Stevenson 51 bus driver 5'6" 160 x
Tracy Umfreville 42 audiologist 5'7" 156 x
Victoria Wolfeschlegelsteinhausenbergerdorff 38 data analyst 5'8" 158 x
Lucy Xi 49 professor 5'9" 161 x
Yolanda Zachary 58 secretary 5'10" 164 x
single_column:
Anna Barbara 35 nurse 5'3" 129 Moderate
Catherine Do 45 physicist 5'5" 135 Active
Eric Frederick 28 teacher 5'5" 140 Moderate
Gabriel Hernandez 55 surgeon 5'7" 150 Sedentary
Ivy Joo 31 engineer 5'2" 126 Sedentary
Kelly Marks 21 student 5'4" 132 Active
Nancy Owens 60 immunologist 5'8" 170 Sedentary
Patricia Qin 36 dental assistant 4'11" 110 Sedentary
Roderick Stevenson 51 bus driver 5'6" 160 Sedentary
Tracy Umfreville 42 audiologist 5'7" 156 Sedentary
Victoria Wolfeschlegelsteinhausenbergerdorff 38 data analyst 5'8" 158 Moderate
Lucy Xi 49 professor 5'9" 161 Moderate
Yolanda Zachary 58 secretary 5'10" 164 Sedentary
teste teste 22 worker 5'8" 110 Active

Related

TypeError: slice indices must be integers or None or have an __index__ method error in scraping a help page

I created python script to scrape facebook help page. I wanted to scrape cms_object_id, cmsID, name. so these values are in a script tag then firstly tried to find all <script> tags then tried to iterate over this and then there is __bbox inside the tags which contains the values wanted to scrape.
so this is my script:
import json
import requests
import bs4
from Essentials import Static
# NOTE(review): question's original code (indentation lost in the paste).
# The TypeError discussed in this thread comes from `start` being initialised
# to a string and never rebound when '__bbox":' is not found in any script tag.
class CmsIDs:
def GetIDs():
# cont = requests.get(""https://www.facebook.com:443/help"", headers=Static.headers) # syntax error
cont = requests.get("https://www.facebook.com:443/help", headers=Static.headers)
soup = bs4.BeautifulSoup(cont.content, "html5lib")
text = soup.find_all("script")
# BUG: `start` starts life as a str; slicing txtstr[start:] below raises
# TypeError unless the find() branch rebinds it to an int.
start = ""
txtstr = ""
for i in range(len(text)):
mystr = text[i]
# mystr = text[i]
print("this is: ", mystr.find('__bbox":'))
if text[i].get_text().find('__bbox":') != -1:
# print(i, text[i].get_text())
txtstr += text[i].get_text()
start = text[i].get_text().find('__bbox":') + len('__bbox":')
print('start:', start)
count = 0
# Brace matching: walk the text until the JSON object after '__bbox":' closes.
for end, char in enumerate(txtstr[start:], start):
if char == '{':
count += 1
if char == '}':
count -= 1
if count == 0:
break
print('end:', end)
# --- convert JSON string to Python structure (dict/list) ---
data = json.loads(txtstr[start:end+1])
# pp.pprint(data)
print('--- search ---')
CmsIDs.search(data)
# --- use recursion to find all 'cms_object_id', 'cmsID', 'name' ---
def search(data):
if isinstance(data, dict):
found = False
if 'cms_object_id' in data:
print('cms_object_id', data['cms_object_id'])
found = True
if 'cmsID' in data:
print('cmsID', data['cmsID'])
found = True
if 'name' in data:
print('name', data['name'])
found = True
if found:
print('---')
for val in data.values():
CmsIDs.search(val)
if isinstance(data, list):
for val in data:
CmsIDs.search(val)
if __name__ == '__main__':
CmsIDs.GetIDs()
the page contains cms_object_id, cmsID, name. so wanted to scrape all these 3 values but I am getting an error:
for end, char in enumerate(txtstr[start:], start):
TypeError: slice indices must be integers or None or have an __index__ method
so how can I solve this error and reach ultimate goal?
Note: Since I'm unfamiliar with and failed to install Essentials, and also because ""https://www.facebook.com:443/help"" raises a syntax error (there should only be one quote on each side of the string), I changed the requests line in my code.
cont = requests.get('https://www.facebook.com:443/help', headers={'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'})
TypeError: slice indices must be integers or None or have an __index__ method
You've initiated start as a string [start = ""] and it needs to be an integer. Unless the if text[i].get_text().find('__bbox":') != -1.... block is entered, start remains a string.
if you just want to not get this error, you could just exit the program if start hasn't been updated [indicating that __bbox": wasn't found in any of the script tags].
print('start:', start)
if start == "":
print('{__bbox":} not found')
return
count = 0
But that still leaves the problem of __bbox": not being found; I'm not sure why, but the issue is resolved for me if I don't use the html5lib parser - just changing to BeautifulSoup(cont.content) resolved this issue.
# soup = bs4.BeautifulSoup(cont.content, "html5lib")
soup = bs4.BeautifulSoup(cont.content) # don't define the parser
# soup = bs4.BeautifulSoup(cont.content, "html.parser") # you could also try other parsers
Other suggestions
Your code will probably work without these, but you might want to consider these suggested improvements for error-handling:
Filter the script Tags
If the text ResultSet only has script tags that contain __bbox":, you avoid looping unnecessarily through the 100+ other scripts, and you won't have to check with if....find('__bbox":') anymore.
text = soup.find_all(lambda t: t.name == 'script' and '"__bbox"' in t.get_text())
for mystr in [s.get_text() for s in text]:
print("this is: ", mystr.find('__bbox":'))
txtstr += mystr
start = mystr.find('__bbox":') + len('__bbox":')
Initiate end
You should initiate the end variable [like end = 0] before the for end, char ... loop because you're using it after the loop as well.
print('end:', end)
data = json.loads(txtstr[start:end+1])
If txtstr[start:] is empty somehow, these lines will raise error/s since end would not be defined yet.
Use a JavaScript Parser
This will make the previous suggestions unnecessary, but as it is json.loads will raise an error if txtstr[start:end+1] is empty somehow, or if it contains any unpaired [and likely escaped] }or{. So, it might be more reliable to use a parser rather than just trying to walk through the string.
I have this function that uses slimit to find values from strings containing JavaScript code. (It's far from perfect, but it seems to for this script, at least.) GetIDs() could be re-written as below.
# import json, requests, bs4, slimit
# from slimit.visitors import nodevisitor
# def findObj_inJS... ## PASTE FROM https://pastebin.com/UVcLniSG
# class CmsIDs:
# Suggested rewrite of GetIDs; depends on findObj_inJS (from the linked
# pastebin), requests and bs4 — not runnable standalone.
def GetIDs():
# cont=requests.get('https://www.facebook.com:443/help',headers=Static.headers)
cont = requests.get('https://www.facebook.com:443/help', headers={
'accept': ';'.join(
[ 'text/html,application/xhtml+xml,application/xml',
'q=0.9,image/avif,image/webp,image/apng,*/*',
'q=0.8,application/signed-exchange',
'v=b3', 'q=0.9' ])})
## in case of request errors ##
try: cont.raise_for_status()
except Exception as e:
print('failed to fetch page HTML -', type(e), e)
return
print('fetched', cont.url, 'with', cont.status_code, cont.reason)
soup = bs4.BeautifulSoup(cont.content)
# Keep only the <script> tags that actually contain "__bbox".
scrCond = lambda t: t.name == 'script' and '"__bbox"' in t.get_text()
jScripts = [s.get_text() for s in soup.find_all(scrCond)]
print(f'Found {len(jScripts)} script tags containing {{"__bbox"}}')
data = [findObj_inJS(s,'"__bbox"') for s in jScripts]
print('--- search ---')
CmsIDs.search(data)
# def search(data)....
Return the Data
This isn't for error-handling, but if you return the data printed by CmsIDs.search you could save it for further use.
def search(data):
    """Recursively collect (and print) dicts containing any target key."""
    collected = []
    wanted = ['cms_object_id', 'cmsID', 'name']
    if isinstance(data, dict):
        hit = {key: data[key] for key in wanted if key in data}
        if hit:
            collected.append(hit)
        for key, val in hit.items():
            print(key, val)
        if hit:
            print('---')
        for child in data.values():
            collected += CmsIDs.search(child)
    if isinstance(data, list):
        for child in data:
            collected += CmsIDs.search(child)
    return collected
The printed result will be the same as before, but if you change the last line of GetIDs to
return CmsIDs.search(data)
and then define a variable cmsList = CmsIDs.GetIDs() then cmsList will be a list of dctionaries, which you could then [for example] save to csv with pandas and view as a table on a spreadsheet.
# import pandas
pandas.DataFrame(cmsList).to_csv('CmsIDs_GetIDs.csv', index=False)
or print the markdown for the table [of the results I got] below
print(pandas.DataFrame(cmsList, dtype=str).fillna('').to_markdown())
[index]
cms_object_id
name
cmsID
0
Використання Facebook
1
570785306433644
Створення облікового запису
2
396528481579093
Your Profile
3
Додати й редагувати інформацію у своєму профілі
1017657581651994
4
Ваші основна світлина й обкладинка
1217373834962306
5
Поширення дописів у своєму профілі та керування ними
1640261589632787
6
Усунення проблем
191128814621591
7
1540345696275090
Додавання в друзі
8
Додавання друзів
246750422356731
9
Люди, яких ви можете знати
336320879782850
10
Control Who Can Friend and Follow You
273948399619967
11
Upload Your Contacts to Facebook
1041444532591371
12
Видалення з друзів чи блокування користувача
1000976436606344
13
312959615934334
Facebook Dating
14
753701661398957
Ваша головна сторінка
15
How Feed Works
1155510281178725
16
Control What You See in Feed
964154640320617
17
Like and React to Posts
1624177224568554
18
Пошук
821153694683665
19
Translate Feed
1195058957201487
20
Memories
1056848067697293
21
1071984682876123
Повідомлення
22
Надсилання повідомлень
487151698161671
23
Переглянути повідомлення й керувати ними
1117039378334299
24
Поскаржитися на повідомлення
968185709965912
25
Відеовиклики
287631408243374
26
Fix a Problem
1024559617598844
27
753046815962474
Reels
28
Watching Reels
475378724739085
29
Creating Reels
867690387846185
30
Managing Your Reels
590925116168623
31
862926927385914
Розповіді
32
Як створити розповідь і поширити її
126560554619115
33
View and Reply to Stories
349797465699432
34
Page Stories
425367811379971
35
1069521513115444
Світлини й відео
36
Світлини
1703757313215897
37
Відео
1738143323068602
38
Going Live
931327837299966
39
Albums
490693151131920
40
Додавання позначок
267689476916031
41
Усунення проблеми
507253956146325
42
1041553655923544
Відео у Watch
43
Перегляд шоу та відео
401287967326510
44
Fix a Problem
270093216665260
45
2402655169966967
Gaming
46
Gaming on Facebook
385894640264219
47
Платежі в іграх
248471068848455
48
282489752085908
Сторінки
49
Interact with Pages
1771297453117418
50
Створити сторінку й керувати нею
135275340210354
51
Імена й імена користувачів
1644118259243888
52
Керування налаштуваннями сторінки
1206330326045914
53
Customize a Page
1602483780062090
54
Publishing
1533298140275888
55
Messaging
994476827272050
56
Insights
794890670645072
57
Banning and Moderation
248844142141117
58
Усунути проблему
1020132651404616
59
1629740080681586
Групи
60
Join and Choose Your Settings
1210322209008185
61
Post, Participate and Privacy
530628541788770
62
Create, Engage and Manage Settings
408334464841405
63
Керування групою для адміністраторів
1686671141596230
64
Community Chats
3397387057158160
65
Pages in Groups
1769476376397128
66
Fix a Problem
1075368719167893
67
1076296042409786
Події
68
Create and Manage an Event
572885262883136
69
View and Respond to Events
1571121606521970
70
Facebook Classes
804063877226739
71
833144153745643
Fundraisers and Donations
72
Creating a Fundraiser
356680401435429
73
Пожертва в рамках збору коштів
1409509059114623
74
Особисті збори коштів
332739730519432
75
For Nonprofits
1640008462980459
76
Fix a Problem
2725517974129416
77
1434403039959381
Meta Pay
78
Платежі в іграх
248471068848455
79
Payments in Messages
863171203733904
80
Пожертва в рамках збору коштів
1409509059114623
81
Квитки на заходи
1769557403280350
82
Monetization and Payouts
1737820969853848
83
1713241952104830
Marketplace
84
Як працює Marketplace
1889067784738765
85
Buying on Marketplace
272975853291364
86
Продаж на Marketplace
153832041692242
87
Sell with Shipping on Marketplace
773379109714742
88
Using Checkout on Facebook
1411280809160810
89
Групи з купівлі й продажу
319768015124786
90
Get Help with Marketplace
1127970530677256
91
1642635852727373
Додатки
92
Manage Your Apps
942196655898243
93
Видимість і конфіденційність додатка
1727608884153160
94
866249956813928
Додатки Facebook для мобільних пристроїв
95
Додаток для Android
1639918076332350
96
iPhone and iPad Apps
1158027224227668
97
Facebook Lite App
795302980569545
98
273947702950567
Спеціальні можливості
99
Керування обліковим записом
100
1573156092981768
Вхід і пароль
101
Вхід в обліковий запис
1058033620955509
102
Змінення пароля
248976822124608
103
Виправлення проблеми із входом
283100488694834
104
Завантаження посвідчення особи
582999911881572
105
239070709801747
Налаштування облікового запису
106
Як змінити налаштування облікового запису
1221288724572426
107
Ваше ім’я користувача
1740158369563165
108
Спадкоємці
991335594313139
109
1090831264320592
Імена у Facebook
110
1036755649750898
Сповіщення
111
Push, Email and Text Notifications
530847210446227
112
Виберіть, про що отримувати сповіщення
269880466696699
113
Усунення проблем
1719980288275077
114
109378269482053
Налаштування реклами
115
Як працює реклама у Facebook
516147308587266
116
Контроль реклами, яку ви бачите
1075880512458213
117
Ваша інформація та реклама у Facebook
610457675797481
118
1701730696756992
Доступ до вашої інформації та її завантаження
119
250563911970368
Деактивація або видалення облікового запису
120
Конфіденційність і безпека
121
238318146535333
Ваша конфіденційність
122
Керуйте тим, хто може переглядати контент, який ви поширюєте у Facebook
1297502253597210
123
Керування своїми дописами
504765303045427
124
Control Who Can Find You
1718866941707011
125
592679377575472
Безпека
126
Джерела щодо боротьби з жорстоким поводженням
726709730764837
127
Ресурси з допомоги для протидії самогубству та самоушкодженню
1553737468262661
128
Crisis Response
141874516227713
129
Ресурси з правил безпеки для допомоги батькам
1079477105456277
130
Інформація для правоохоронних органів
764592980307837
131
235353253505947
Захист облікового запису
132
Функції безпеки та поради з її забезпечення
285695718429403
133
Сповіщення про вхід і двоетапна перевірка
909243165853369
134
Уникайте спаму та шахрайства
1584206335211143
135
236079651241697
Безпека під час здійснення покупок
136
Розпізнавання шахрайства
1086141928978559
137
Уникнення шахрайства
2374002556073992
138
Купівля на Marketplace
721562085854101
139
Поради щодо безпечної купівлі
123884166448529
140
Купуйте впевнено
1599248863596914
141
Політики та скарги
142
1753719584844061
Скарга на порушення
143
Як поскаржитися на щось?
1380418588640631
144
Don't Have an Account?
1723400564614772
145
1126628984024935
Як повідомити про проблему у Facebook
146
186614050293763
Being Your Authentic Self on Facebook
147
1561472897490627
Повідомлення про порушення конфіденційності
148
1216349518398524
Зламані та фальшиві облікові записи
149
275013292838654
Керування обліковим записом померлої людини
150
About Memorialized Accounts
1017717331640041
151
Request to Memorialize or Remove an Account
1111566045566400
152
399224883474207
Інтелектуальна власність
153
Авторське право
1020633957973118
154
Торговельна марка
507663689427413
155
1735443093393986
Про наші політики

pandas DataFrame: "Input must be a unicode string, not 0" — how to pass the city string rather than the whole DataFrame

i try to manipulate some dataframe and i did a function to calculate the distance between 2 cities.
# Return the distance in whole kilometres between the two place names A and B,
# geocoding each via the OpenCage API (requires network access).
def find_distance(A,B):
# NOTE(review): hard-coding an API key in source is a security risk —
# load it from an environment variable instead.
key = '0377f0e6b42a47fe9d30a4e9a2b3bb63' # get api key from: https://opencagedata.com
geocoder = OpenCageGeocode(key)
result_A = geocoder.geocode(A)
lat_A = result_A[0]['geometry']['lat']
lng_A = result_A[0]['geometry']['lng']
result_B = geocoder.geocode(B)
lat_B = result_B[0]['geometry']['lat']
lng_B = result_B[0]['geometry']['lng']
# geodesic() is presumably geopy's — TODO confirm; int() truncates to whole km.
return int(geodesic((lat_A,lng_A), (lat_B,lng_B)).kilometers)
this is my dataframe
2 32 Mulhouse 1874.0 2 797 16.8 16,3 € 10.012786
13 13 Saint-Étienne 1994.0 3 005 14.3 13,5 € 8.009882
39 39 Roubaix 2845.0 2 591 17.4 15,0 € 6.830968
27 27 Perpignan 2507.0 3 119 15.1 13,3 € 6.727255
40 40 Tourcoing 3089.0 2 901 17.5 15,3 € 6.327547
25 25 Limoges 2630.0 2 807 14.2 12,5 € 6.030424
20 20 Le Mans 2778.0 3 202 14.4 12,3 € 5.789559
there is my code:
# Decode each cell of a row, dropping non-ASCII characters.
# NOTE(review): only valid for bytes cells — str has no .decode() in Python 3;
# both call sites below are commented out anyway.
def clean_text(row):
# return the list of decoded cell in the Series instead
return [r.decode('unicode_escape').encode('ascii', 'ignore') for r in row]
def main():
inFile = "prix_m2_france.xlsx" # open the Excel file
inSheetName = "Sheet1" # the Excel sheet name
cols = ['Ville', 'Prix_moyen', 'Loyer_moyen'] # the columns to clean
df =(pd.read_excel(inFile, sheet_name = inSheetName))
# Strip euro signs and spaces, normalise decimal commas to dots.
df[cols] = df[cols].replace({'€': '', ",": ".", " ": "", "\u202f":""}, regex=True)
# df['Prix_moyen'] = df.apply(clean_text)
# df['Loyer_moyen'] = df.apply(clean_text)
df['Prix_moyen'] = df['Prix_moyen'].astype(float)
df['Loyer_moyen'] = df['Loyer_moyen'].astype(float)
# df["Prix_moyen"] += 1
# Yield: 12 months' rent over price times 1.0744 (presumably purchase
# fees — TODO confirm), expressed as a percentage.
df["revenu"] = (df['Loyer_moyen'] * 12) / (df["Prix_moyen"] * 1.0744) * 100
# df['Ville'].replace({'Le-Havre': 'Le Havre', 'Le-Mans': 'Le Mans'})
df["Ville"] = df['Ville'].replace(['Le-Havre', 'Le-Mans'], ['Le Havre', 'Le Mans'])
# BUG (the question): the whole 'Ville' Series is passed where a single city
# string is expected — apply find_distance per row instead.
df["distance"] = find_distance("Paris", df["Ville"])
df2 = df.sort_values(by = 'revenu', ascending = False)
print(df2.head(90))
main()
df["distance"] = find_distance("Paris", df["Ville"]) fails and give me this error:
opencage.geocoder.InvalidInputError: Input must be a unicode string, not 0 Paris
1 Marseille
2 Lyon
3 T
I imagine it as a loop where i will put the distance between paris and the city but i guess it take all the dataframe on my first value.
Thanks for your help
(Edit, i just pasted a part of my dataframe)
You can try something like :
df["distance"] = [find_distance("Paris", city) for city in df["Ville"]]

Unable to format output in Python correctly

I am unable to format in Python correctly. Below is what my list looks like. I am not sure why the spacing is off on some of the fields. Below is my code as well as a snip of how it reads.
# Read golfer records (four lines each: first name, last name, handicap,
# score) from golf.txt and print a report; a score of 80 counts as par.
def main():
golf_file = open('golf.txt', 'r') #open file
first_name = golf_file.readline() #read first line
print('First Name\tLast Name\tHandicap\tGolf Score\tOver, Under or Par') #print headings
while first_name != '': #while statement for loop
last_name = golf_file.readline()
handicap = golf_file.readline()
golf_score = golf_file.readline()
#stripping newline from each string
first_name = first_name.rstrip('\n')
last_name = last_name.rstrip('\n')
handicap = handicap.rstrip('\n')
golf_score = golf_score.rstrip('\n')
handicap_num = float(handicap)
golfscore_num = int(golf_score)
#if statement to determine if golf score is over, under or par
if golfscore_num == 80:
OverUnderPar = ('Par')
elif golfscore_num < 80:
OverUnderPar = ('Under Par')
else:
OverUnderPar = ('Over Par')
#print info with two tabs for positioning.
# NOTE(review): tab stops drift when a field crosses a tab width — this is the
# misalignment the question describes; fixed-width str.format fields fix it.
print( first_name, '\t''\t', last_name, '\t''\t', handicap_num, '\t', '\t', golfscore_num, '\t', '\t', OverUnderPar)
first_name = golf_file.readline()
golf_file.close() #close file
main()
First Name Last Name Handicap Golf Score Over, Under or Par
Andrew Marks 11.2 72 Under Par
Betty Franks 12.8 89 Over Par
Connie William 14.6 92 Over Par
Donny Ventura 9.9 78 Under Par
Ernie Turner 10.1 81 Over Par
Fred Smythe 8.1 75 Under Par
Greg Tucker 7.2 72 Under Par
Henry Zebulon 8.3 83 Over Par
Ian Fleming 4.2 72 Under Par
Jan Holden 7.7 84 Over Par
Kit Possum 7.9 79 Under Par
Landy Bern 10.3 93 Over Par
Mona Docker 11.3 98 Over Par
Kevin Niles 7.1 80 Par
Pam Stiles 10.9 87 Over Par
Russ Hunt 5.6 73 Under Par
If the name is too big(over 6 char) the tab will be moved one more down. You can either check if the name is too big and move it down own tab.By using something like
numTabs = '\t' * (2 - len(last_name)//6)
Or a better approach would be to use something like str.format, as mentioned by @Michael Butscher in the comments.

Need help writing a regex

I need a regex that reads a file with blast information.
The file looks like:
****ALIGNMENT****
Sequence: gi|516137619|ref|WP_017568199.1| hypothetical protein [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
I already have a code, but in this file there is some extra data. The variable names with the corresponding name in this example, are:
hitsid = 516137619
protein = hypothetical protein
organism = Nocardiopsis synnemataformans
length = 136
evalue = 8.9548e-11
score = 153.0
bitscore = 63.5438
identities = 35
positives = 42
gaps = 6
query = MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match = MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
subject = MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
I'm looking for something like this, this is a regex I already got, but now there are some extra things added:
p = re.compile(r'^Sequence:[^|]*\|(?P<hitsid>[^|]*)\|\S*\s*(?P<protein>[^][]*?)\s*\[(?P<organism>[^][]*)][\s\S]*?\nE-value:\s*(?P<evalue>.*)', re.MULTILINE)
File looks like:
****ALIGNMENT****
Sequence: gi|516137619|ref|WP_017568199.1| hypothetical protein [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
****ALIGNMENT****
Sequence: gi|962700925|ref|BC_420072443.1| Protein crossbronx-like [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
****ALIGNMENT****
Sequence: gi|516137619|ref|WP_017568199.1| hypothetical protein [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
You don't need a regex for this:
# Split the BLAST report into records on the '****ALIGNMENT****' separator and
# parse each 'key: value' line into a dict (one dict per record).
# Fixes vs the original: the file handle is now closed (with-block instead of a
# bare open()), and the bare `except: pass` — which silently swallowed every
# error — is replaced by an explicit partition() test that simply skips lines
# without a ':' while keeping values that themselves contain colons intact.
parsed = []
with open('tmp9.txt', 'r') as blast_file:
    raw_parts = blast_file.read().split('****ALIGNMENT****')
for raw_part in raw_parts:
    parsed_dict = {}
    for line in raw_part.split('\n'):
        key, sep, value = line.partition(':')
        if sep:  # lines without a colon (blanks, separators) are skipped
            parsed_dict[key] = value.strip()
    parsed.append(parsed_dict)
print(parsed)

Appending data to text file dependant on if statment [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking for code must demonstrate a minimal understanding of the problem being solved. Include attempted solutions, why they didn't work, and the expected results. See also: Stack Overflow question checklist
Closed 9 years ago.
Improve this question
I am trying to get a text file automatically updating new class details.
# Example variables
completed = Yes
class = 13A
time = 11:00
if completed:
# Check data class and time variables against text file and if they don't exist then add them, if they do exist do nothing.
My text files look like:
13A
11:00
Top Students: Joe Smith, Tom Clarke, Jenna Sole
Top 3
Attendance: 98.5%
Class Score: 54
Yes
13B
11:10
Top Students: Anni Moy, Jessica Longate, Phillip Tome
T3
Attendance: 98.5%
Class Score: 54
Yes
14A
11:10
Top Students: John Doe, John Smith, Sam Ben
T2
Attendance: 98.5%
Class Score: 54
Yes
Does any one know how this can be done, I would greatly appreciate an example if anyone could be so helpful.
Here's the code that parses the text file and dumps them into variables.
Code below illustrates how to parse your text file using regex.
# Parse class_data.txt: records separated by blank lines, each record holding
# class id, time, "Top Students: ..." line, rank, attendance, score, completed.
# NOTE(review): Python 2 code (print statements) — port print to print() for
# Python 3.
import re
fp = open('class_data.txt')
lines = fp.read(-1)
fp.close()
records = re.split('\n\s*\n', lines) #Split all the records
#print len(records)
for record in records:
data = record.split('\n')
# data[2] is "Top Students: a, b, c" — strip the label, split the names.
classid, classtime, top_students = data[0], data[1], re.split('^[A-Za-z ]*:', data[2])[1].split(',')
attendance, score, completed = re.split('^[A-Za-z ]*:', data[4])[1], re.split('^[A-Za-z ]*:', data[5])[1], data[6]
print classid, classtime, top_students, len(top_students), attendance, score, completed
Print statement outputs
13A 11:00 [' Joe Smith', ' Tom Clarke', ' Jenna Sole'] 3 98.5% 54 Yes
13B 11:10 [' Anni Moy', ' Jessica Longate', ' Phillip Tome'] 3 98.5% 54 Yes
14A 11:10 [' John Doe', ' John Smith', ' Sam Ben'] 3 98.5% 54 Yes
Now that you have your text file converted into variables, We can now add the code to check whether a class is finished and if the record is already contained in the file else add it
# Same parsing as above, then: if the (class, time) pair is already recorded,
# do nothing; otherwise append it to the file.
# NOTE(review): illustrative Python 2 pseudo-code — `class` is a reserved word
# and Yes / 13A / 11:00 are not valid literals; rename and quote before running.
import re
fp = open('class_data.txt')
lines = fp.read(-1)
fp.close()
completed = Yes
class = 13A
time = 11:00
isClassRecordFound = False
records = re.split('\n\s*\n', lines) #Split all the records
#print len(records)
for record in records:
data = record.split('\n')
classid, classtime, top_students = data[0], data[1], re.split('^[A-Za-z ]*:', data[2])[1].split(',')
attendance, score, completed = re.split('^[A-Za-z ]*:', data[4])[1], re.split('^[A-Za-z ]*:', data[5])[1], data[6]
print classid, classtime, top_students, len(top_students), attendance, score, completed
if (completed):
if (classid == class) and (time == classtime):
isClassRecordFound = True
break;
if not isClassRecordFound:
with open("class_data.txt", "a") as myfile:
myfile.write(class + '\n' + time)

Categories