EDIT 2, 9/1 See my answer below!
Pretty new at Python and Pandas here. I've got a script here that uses a for loop to query my database using each line in my list. That all works great, but I can't figure out how to build a data frame from the results of that loop. Any and all pointers are welcome!
#Remove stuff
print "Cleaning list"
def multiple_replacer(key_values):
replace_dict = dict(key_values)
replacement_function = lambda match: replace_dict[match.group(0)]
pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
return lambda string: pattern.sub(replacement_function, string)
multi_line = multiple_replacer(key_values)
print "Querying Database..."
for line in source:
brand_url = multi_line(line)
#Run Query with cleaned list
mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
list1 = []
brands = my_query('prod', mysql_query)
print "Writing CSV..."
#Create DF and CSV
for row in brands:
list1.append({"URL":row['URL'],"Name":['Name'],"ID":['ID']})
if brands.shape == (3,0):
df1 = pd.DataFrame(data = brands, columns=['URL','Name','ID'])
output = df1.to_csv('ongoing.csv',index=False)
EDIT 8/30
Here is my edit, attempting to use zyxue's method:
#Remove stuff
print "Cleaning list"
def multiple_replacer(key_values):
replace_dict = dict(key_values)
replacement_function = lambda match: replace_dict[match.group(0)]
pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
return lambda string: pattern.sub(replacement_function, string)
multi_line = multiple_replacer(key_values)
print "Querying Database..."
for line in source:
brand_url = multi_line(line)
#Run Query with cleaned list
mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
brands = my_query('prod', mysql_query)
print "Writing CSV..."
#Create DF and CSV
records = []
for row in brands:
records.append({"URL":row['URL'],"Name":['Name'],"ID":['ID']})
if brands.shape == (3,0):
records.append(dict(zip(brands, ['URL', 'Name', 'ID'])))
df1 = pd.DataFrame.from_records(records)
output = df1.to_csv('ongoing.csv', index=False)
but this only returns a blank CSV. I'm sure I'm applying it wrong.
records = []
for row in brands:
# if brands.shape == (3,0):
# records.append(dict(zip(brands, ['URL', 'Name', 'ID'])))
# update bug fix:
if row.shape == (3,0):
records.append(dict(zip(row, ['URL', 'Name', 'ID'])))
df1 = pd.DataFrame.from_records(records)
output = df1.to_csv('ongoing.csv', index=False)
# ref:
# >>> pd.DataFrame.from_records([{'a': 1, 'b':2}, {'a': 11, 'b': 22}])
# a b
# 0 1 2
# 1 11 22
Okay, I figured it out, and I thought I should post the working script. #zyxue was pretty much right.
source = open('urls.txt')
key_values = ("http://",""), ("https://",""), ("www.",""), ("\n","")
#Remove stuff
print "Cleaning list"
def multiple_replacer(key_values):
replace_dict = dict(key_values)
replacement_function = lambda match: replace_dict[match.group(0)]
pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
return lambda string: pattern.sub(replacement_function, string)
multi_line = multiple_replacer(key_values)
print "Querying Database..."
records = []
for line in source:
brand_url = multi_line(line)
#Run Query with cleaned list
mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
brands = my_query('prod', mysql_query)
#Append results to dict (records)
for row in brands:
records.append({"URL":row['URL'],"Name":row['Name'],"ID":row['ID']})
#Create DataFrame
df = pd.DataFrame.from_dict(records)
#Create CSV
output = df.to_csv('ongoing.csv',index=False)
Essentially, I needed to layer the second for loop under the first and create the 'records' dictionary before the looping began. This causes an append to the dictionary for every line in 'source'. Seems like a pretty simple concept now!
Related
I use pylucence 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document but pylucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'
def indexDocument(title, year, plot):
ft = FieldType()
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
doc = document.Document()
doc.add(document.Field("Title", title, ft))
doc.add(document.Field("Plot", plot, ft))
writer.addDocument(doc)
def CloseWriter():
writer.close()
def makeInvertedIndex(file_path):
df = pd.read_csv(file_path)
print(df.columns)
docid = 0
for i in df.index:
print(docid, '-', df['Title'][i])
indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
docid += 1
indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
need to save them into different data frames
query = '''select name
from my_table
where class = {}
and student_number > {}
and student_number <= {} +10
group by name'''
inputs = list(range(0, 100,10))
classes = [1,2,3,4]
the expected result is running these batches for each class individually. e.g df_class1, df_class2 df_class3, df_class4
query = '''
select name from my_table where class = {} and student_number >
{} and student_number <= {} +50 group by name'''
inputs = list(range(0, 100,10))
classes = [1,2,3,4]
not sure on this part ##for i in inputs: for c in classes: query.format(c, i, i)##
results = pd.DataFrame() for input, query in queries.items():
res = my_db.execute(query)
results = results.append(pd.DataFrame(res))
each results as sth like ;df_class1, df_class2 df_class3, df_class4
You can use formatted string to save the resultant dataframe for each iteration.
inputs = list(range(0, 100,10))
classes = [1,2,3,4]
for i in inputs:
for c in classes:
query.format(c, i, i)
res = my_db.execute(query)
df = pd.DataFrame(res)
df.to_csv(f'result_{i}_{c}.csv')
I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linktext fileand convert it to a csv file. Right now I am only half successful. Namely, I can locate the table and parse it but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title,splited_data):
found_no_lines = []
for index, line in enumerate(splited_data):
if table_title in line:
found_no_lines.append(index)
return found_no_lines
table_start = find_no_line_start_table(table_title,splited_data)
# I need help with locating the table. If I locate the table use the above function, it will return two locations and I have to manually choose the correct one.
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
for index, row in enumerate(splited_data[table_start:]):
if '<C>' in row:
return table_start + index
def get_end_table(start_table_data, splited_data ):
for index, row in enumerate(splited_data[start_table_data:]):
if END_TABLE_LINE in row:
return start_table_data + index
def row(l):
l = l.split()
number_columns = 8
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w
return data_row
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
# I also need help with convert the text table to a CSV file, somehow the following function does not #recognize my column.
def take_table(table):
owner = []
Num_share = []
middle = []
middle_1 = []
middle_2 = []
middle_3 = []
prior_offering = []
after_offering = []
for r in table:
data_row = row(r)
if data_row:
col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
owner.append(col_1)
Num_share.append(col_2)
middle.append(col_3)
middle_1.append(col_4)
middle_2.append(col_5)
middle_3.append(col_6)
prior_offering.append(col_7)
after_offering.append(col_8)
table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
'after_offering': after_offering}
return table_data
#print (table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
in my basic search form with framework django , when i enter wrong keyword of a drug dataset in my search form gets me wrong like "KeyError"
this search form work with pandas lib , so i am just want when i put word wrong do not show me error
i want to show to the user message "nothing match try something else"
this is the error when i put word not in my dataframe the word is Tramadol
this is my code
def search_recommender(request):
query = request.GET.get('q')
if query:
indices = pd.Series(df.index, index=df['drugName']).drop_duplicates()
idx = indices[query]
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[0], reverse=True)
sim_scores = sim_scores[1:6]
mov_indices = [i[0] for i in sim_scores]
gg_will = df['drugName'].iloc[mov_indices]
json = gg_will.to_json(orient='values')
else:
qs = DrugDataset.objects.all()
df = qs.to_dataframe()
json=df.filter(drugName='q')
You can use a try-except to specify that the search was unsuccesful, like:
def search_recommender(request):
query = request.GET.get('q')
if query:
indices = pd.Series(df.index, index=df['drugName']).drop_duplicates()
try:
idx = indices[query]
except KeyError:
json = 'No match, try something else'
# ...
else:
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[0], reverse=True)
sim_scores = sim_scores[1:6]
mov_indices = [i[0] for i in sim_scores]
gg_will = df['drugName'].iloc[mov_indices]
json = gg_will.to_json(orient='values')
else:
qs = DrugDataset.objects.all()
df = qs.to_dataframe()
json = df.filter(drugName='q')
How can I do the following in Python:
I have a command output that outputs this:
Datexxxx
Clientxxx
Timexxx
Datexxxx
Client2xxx
Timexxx
Datexxxx
Client3xxx
Timexxx
And I want to work this in a dict like:
Client:(date,time), Client2:(date,time) ...
After reading the data into a string subject, you could do this:
import re
d = {}
for match in re.finditer(
"""(?mx)
^Date(.*)\r?\n
Client\d*(.*)\r?\n
Time(.*)""",
subject):
d[match.group(2)] = (match.group(1), match.group(2))
How about something like:
rows = {}
thisrow = []
for line in output.split('\n'):
if line[:4].lower() == 'date':
thisrow.append(line)
elif line[:6].lower() == 'client':
thisrow.append(line)
elif line[:4].lower() == 'time':
thisrow.append(line)
elif line.strip() == '':
rows[thisrow[1]] = (thisrow[0], thisrow[2])
thisrow = []
print rows
Assumes a trailing newline, no spaces before lines, etc.
What about using a dict with tuples?
Create a dictionary and add the entries:
dict = {}
dict['Client'] = ('date1','time1')
dict['Client2'] = ('date2','time2')
Accessing the entires:
dict['Client']
>>> ('date1','time1')