I'm trying to create a search, but I'm facing an error. According to some tests, I can search by 'name', but I would like to search by 'number_order'. Does anyone have a solution? Note that 'number_order' must not be changed inside the DataFrame, e.g. 'number_order' : [202204000001] -> 'number_order' : ['202204000001']
import pandas as pd
import matplotlib.pyplot as plt
# Sample orders: 'number_order' holds Python ints, 'client' and 'value' hold strings.
d = {'number_order' : [202204000001, 202204000002, 202204000003, 202204000004,
202204000005, 202204000006],
'client' : ['Roger Nascimento', 'Rodrigo Peixato', 'Pedro',
'Rafael', 'Maria', 'Emerson'],
'value' : ['120', '187.74', '188.7', '300', '563.2', '198.0']
}
df = pd.DataFrame(data = d)
# Prefix to search for.
# NOTE(review): every sample order number starts with '2022', so '202004' matches none of them - confirm the intended prefix.
src_field_data = '202004'
filtered_data = df['number_order']
# NOTE: 'number_order' is built from ints, so the column is not string-typed and
# the .str accessor on the next line raises AttributeError.
filtered_data = df.loc[filtered_data.str.contains(f'^{src_field_data}', case = False)]
print(f'number_order FILTERED {filtered_data}\n')
I want to search as in the example below, which filters using only part of the text:
import pandas as pd
import matplotlib.pyplot as plt
# Build the sample orders table one column at a time.
order_numbers = [
    202204000001, 202204000002, 202204000003,
    202204000004, 202204000005, 202204000006,
]
client_names = ['Roger Nascimento', 'Rodrigo Peixato', 'Pedro', 'Rafael', 'Maria', 'Emerson']
order_values = ['120', '187.74', '188.7', '300', '563.2', '198.0']
d = {'number_order': order_numbers, 'client': client_names, 'value': order_values}
df = pd.DataFrame(d)

# Keep the rows whose client name starts with the search text, ignoring case.
src_field_data = 'R'
starts_with_text = df['client'].str.contains(f'^{src_field_data}', case=False)
filtered_data = df.loc[starts_with_text]
print(f'number_order FILTERED {filtered_data}\n')
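Convert the values to strings before using the `.str` accessor (`astype(str)` returns a new Series, so the DataFrame itself is not modified):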
# Cast the selected column to text so the .str accessor is available,
# then keep the rows whose value starts with the search text.
as_text = filtered_data.astype(str)
prefix_mask = as_text.str.contains(f'^{src_field_data}', case=False)
filtered_data = df.loc[prefix_mask]
Related
I have the following DataFrame:
import pandas as pd
# Two parallel columns: a task status and its date; some rows are blank in both.
statuses = ['', 'To Do', '', 'Completed', 'To Do', 'In Progress', 'Completed']
dates = ['', '9/1/2022', '', '12/5/2019', '8/12/2020', '4/19/2020', '12/31/2018']
df = pd.DataFrame({'Status': statuses, 'Date': dates})
I want to blank out the Date column for those rows where the Status is anything but "Completed". Using .loc, I am getting an error.
# NOTE: a chained assignment binds its targets left to right, so `df` is rebound
# to '' first and `df.loc[...] = ''` is then attempted on that string.
df = df.loc[df['Status'] != 'Completed', 'Date'] = ''
AttributeError: 'str' object has no attribute 'loc'
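There is a typo here: the leading `df =` turns the line into a chained assignment, so `df` is rebound to the empty string before the `.loc` assignment is attempted: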
# The leading `df =` is the typo: it makes this a chained assignment.
df = df.loc[df['Status'] != 'Completed', 'Date'] = ''
Should be
# Blank out 'Date' on every row whose status is anything other than 'Completed'.
not_completed = df['Status'] != 'Completed'
df.loc[not_completed, 'Date'] = ''
I'm trying to take a spreadsheet as input and display only the URLs that are CMS related (wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla). I want the output to show each "technologies" value followed by the "url" values associated with (grouped under) that technology.
Link to data file
output
Code is:
import pandas as pd
import numpy as np
# Workbook to read and the CMS names to look for (a regex alternation).
dataset = "ASD-example.xlsx"
term_cms = 'wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla'

df = pd.read_excel(dataset, sheet_name="HTTPX")

# Number of URLs sharing each 'technologies' value, computed over the whole sheet.
df['technology_count'] = df.groupby('technologies')['url'].transform('count')

# Drop the columns that are not wanted in the report.
df.drop(
    ['timestamp', 'request', 'response-header', 'scheme', 'port', 'body-sha256',
     'header-sha256', 'a', 'cnames', 'input', 'location', 'error', 'response-body',
     'content-type', 'method', 'host', 'content-length', 'chain-status-codes',
     'status-code', 'tls-grab', 'csp', 'vhost', 'websocket', 'pipeline', 'http2',
     'cdn', 'response-time', 'chain', 'final-url', 'failed', 'favicon-mmh3',
     'lines', 'words', 'path', 'webserver'],
    inplace=True,
    axis=1,
)

# Keep only the CMS rows. Boolean indexing returns a new frame, so the result
# must be assigned back; as a bare expression the filter had no effect on the
# pivot table built below.
df = df[df['technologies'].str.contains(term_cms, na=False)]

pivot1 = pd.pivot_table(df, index=['technologies', 'url'], columns=None, fill_value=0)
print(pivot1)
I cleaned up your code a bit to make it more readable and to get the output you want.
# CMS names to keep; they are compared against the lowercased column.
term_cms = ["wordpress", "drupal", "joomla"]

# Normalise the column: strip surrounding square brackets, then lowercase.
df['technologies'] = df['technologies'].str.strip('[]').str.lower()

# Restrict the frame to rows whose technology is one of the CMS names.
df = df[df['technologies'].isin(term_cms)]

# One row per (technology, url) pair together with its number of occurrences.
pair_sizes = df.groupby(['technologies', 'url']).size()
df = pair_sizes.reset_index(name='technology_count')
Output:
technologies url technology_count
0 joomla https://testcom123. 1
1 wordpress https://test.com:443 1
Hello, fellow developers on Stack Overflow.
I have string data in the following form:
'key=apple; age=10; key=boy; age=3'
How can I convert it into a pandas DataFrame such that `key` and `age` are the column headers and the values go into the corresponding columns?
key age
apple 10
boy 3
Try this:
import pandas as pd
# Parse a ';'-separated string of name=value fields into two parallel lists
# and build a DataFrame with one column per field name.
data = 'key=apple; age=10; key=boy; age=3'
key = []
age = []
for field in data.split(";"):
    # Split on the first '=' only and trim the blank that follows each ';'.
    name, _, value = field.partition("=")
    name = name.strip()
    value = value.strip()
    # Compare the field name exactly: a substring test such as `"key" in field`
    # would also match values that merely contain the text "key".
    if name == "key":
        key.append(value)
    elif name == "age":
        age.append(value)
df = pd.DataFrame({"key": key, "age": age})
print(df)
You can try this:
import pandas as pd
# Turn 'key=...; age=...' pairs into one record (dict) per pair.
str_stream = 'key=apple; age=10; key=boy; age=3'
lst_kv = str_stream.split(';')
# lst_kv => ['key=apple', ' age=10', ' key=boy', ' age=3']

# Step through the list two items at a time so that each record receives one
# complete key/age pair. Using range(len(lst_kv)//2) as the start index makes
# the windows overlap ([0:2], [1:3]) and pairs 'boy' with the first age.
res = [
    {s.split('=')[0].strip(): s.split('=')[1].strip() for s in lst_kv[i:i + 2]}
    for i in range(0, len(lst_kv), 2)
]
df = pd.DataFrame(res)
df
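Output (the second row should read `boy 3`; the `10` shown here is what results when the pairs are sliced as overlapping windows `lst_kv[i:i+2]` for `i = 0, 1` instead of stepping `i` by 2):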
key age
0 apple 10
1 boy 10
A step-by-step version of the one-line `res` expression:
# Build one dict per key/age pair, written out as explicit loops.
res = []
# Advance two items at a time so every dict gets one complete pair; starting a
# window at each consecutive index would make the windows overlap.
for i in range(0, len(lst_kv), 2):
    dct_tmp = {}
    for s in lst_kv[i:i + 2]:
        kv = s.split('=')
        # Trim the blank that follows each ';' in the source string.
        dct_tmp[kv[0].strip()] = kv[1].strip()
    res.append(dct_tmp)
res
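Output (the second dict should pair `'key': 'boy'` with `'age': '3'`; the `'10'` shown here comes from overlapping windows, i.e. starting a two-item slice at indices 0 and 1 rather than 0 and 2):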
[{'key': 'apple', 'age': '10'}, {'age': '10', 'key': 'boy'}]
My current database is:
# required libraries
import pandas as pd
# Administrator names and, for each one, the text of a bracketed, quoted
# token list stored as a single string.
admin_names = ['CC Brasil', 'ABC Futuro Esporte', 'Tabuao']
news_tokens = [
    "['folha', 'paulo', 'https', 'east', 'amazonaws', 'multclipp', 'arquivos', 'noticias', 'pdf', 'jpg', 'mônica', 'bergamo', 'longo', 'tempo']",
    "['coluna', 'estadão']",
    "['flamengo', 'futebol','melhor','campeao','é']",
]
dict_noticia = {'nome_adm': admin_names, 'noticia': news_tokens}
df = pd.DataFrame(dict_noticia)
df
I need a new column with the lemmas of the "noticia" (news) column.
The script below raises an error:
import stanza
# Stanza pipeline configured with lang='pt' and the tokenize, mwt, pos and lemma processors.
nlp_stanza = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')
# Fill df[new_column_name] by applying a lambda to each value of df[column_name], then return df.
def f_lematizacao_stanza(df,column_name,new_column_name):
# NOTE: the lambda's parameter is `x`, but its body passes `row`, a name that is
# not defined anywhere in this snippet.
df[new_column_name] = df[column_name].apply(lambda x: ([w.lemma_ for w in nlp_stanza(row)]))
return df
# NOTE(review): `data` is not defined in the visible snippet; the DataFrame built above is bound to `df` - confirm.
f_lematizacao_stanza(data,'noticia','noticia_lema')
NameError: name 'row' is not defined
How can I solve this?
Thank you in advance.
You have not defined the variable `row`. The lambda's parameter is named `x`, so you need to use `x`:
def f_lematizacao_stanza(df, column_name, new_column_name):
    """Add a column holding the lemmas of each text in ``df[column_name]``.

    Each value of ``df[column_name]`` is passed unchanged to the module-level
    ``nlp_stanza`` pipeline, and the lemmas of all resulting words are
    collected into a list.

    Args:
        df: DataFrame to extend; it is modified in place.
        column_name: Name of the column containing the input text.
        new_column_name: Name of the column that receives the lemma lists.

    Returns:
        The same ``df`` object, with ``new_column_name`` added.
    """
    def _lemmas(text):
        # The lambda argument (here `text`) is what must be passed to the
        # pipeline. A Stanza Document exposes its sentences, each sentence its
        # words, and each word its lemma as `.lemma` (`.lemma_` is the spaCy
        # attribute name).
        doc = nlp_stanza(text)
        return [word.lemma for sentence in doc.sentences for word in sentence.words]

    df[new_column_name] = df[column_name].apply(_lemmas)
    return df
I'm learning python and machine learning and trying to create a very simple csv from synthetic data.
Can anyone help me tweak this to get it to work in PyCharm?
I'm trying to input a random value from the selection in each column.
Much appreciated
import random
import pandas as pd
# Generate a small synthetic data set and write it to 'taxfix_data.csv'.
N_ROWS = 1000

# Allowed values for each categorical column. They are kept in one mapping and
# never rebound, so every row draws from the full set of options. Tuples are
# used because random.choice needs an indexable sequence.
choices = {
    'marriage_status': ('single', 'married', 'divorced', 'widowed', 'complicated'),
    'children': ('yes', 'no'),
    'employment': ('employed', 'self_employed', 'unemployed', 'student'),
    'income_abroad': ('yes', 'no'),
    'gender': ('M', 'F'),
    'response': ('refund', 'payment'),
}
columns = ['marriage_status', 'children', 'employment',
           'income_abroad', 'age', 'gender', 'income', 'expenses', 'response']

rows = []
for _ in range(N_ROWS):
    # One random pick per categorical column, plus the numeric fields.
    row = {name: random.choice(options) for name, options in choices.items()}
    row['age'] = random.randint(18, 70)
    row['income'] = random.randint(0, 100000)
    row['expenses'] = random.randint(0, 10000)
    rows.append(row)

# Build the frame once from all collected rows; `columns` fixes the column order.
df = pd.DataFrame(rows, columns=columns)
# `index=False` is an argument of to_csv: it leaves the row index out of the file.
df.to_csv('taxfix_data.csv', index=False)
If you're going to use pandas, the easiest way is to do it like this:
import pandas as pd
import random

# Number of synthetic rows to generate.
N_ROWS = 1000

# Allowed values for each categorical column.
choices = {
    "marriage_status": ['single', 'married', 'divorced', 'widowed', 'complicated'],
    "children": ['yes', 'no'],
    "employment": ['employed', 'self_employed', 'unemployed', 'student'],
    "gender": ['M', 'F'],
    "response": ['refund', 'payment'],
    "income_abroad": ['yes', 'no'],
}

# A DataFrame built from a dict of lists needs every column to have the same
# length, so the option lists cannot be passed in directly. Draw N_ROWS random
# values per column instead, and give the frame an index of matching length
# that starts at 1.
df = pd.DataFrame(
    {name: random.choices(options, k=N_ROWS) for name, options in choices.items()},
    index=range(1, N_ROWS + 1),
)
Also, here's a really useful cheat sheet for pandas: https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf