Heyo, I've been working on this code:
import datetime
import json
import time

import matplotlib  # i was gonna use this for graphing later lol
import openpyxl
import requests as r
from openpyxl import Workbook, load_workbook
from openpyxl.chart import BarChart, LineChart, Reference, Series
from openpyxl.chart.axis import DateAxis

# i dont thinks we have to do it this way, i just saw that this is how the guy
# on stackoverflow was doing it and didnt argue lol
try:
    from openpyxl.cell import get_column_letter
except ImportError:
    from openpyxl.utils import get_column_letter
from openpyxl.utils import column_index_from_string
myname = input('Enter your name: ')
def do_the_work(myname):
userLink = 'https://api.faceit.com/users/v1/nicknames/'+myname #Inputting the name given here to get the UUID of the user
print("Getting user . . . this may take a while")
reqUserLink = r.get(userLink).json() #requesting the link's JSON data
print("UUID Found; ",str(reqUserLink['payload']['id']))
reqUserList = []
reqUserList.append(str(reqUserLink['payload']['id'])) #adding it to our own list because python HATES nested stuff
print(reqUserList)
userStatsLink = ""
userStatsLink = f"https://api.faceit.com/stats/v1/stats/time/users/{reqUserList[0]}/games/csgo"
print(userStatsLink)
params = {"page": 0, "size": 30} #we have to do this because the data prevents us from having more than 30 matches a page
matches = [] #creating our own dictionary to grab only relevant data from the matches
totalRankedM = 1 #showing ranked matches (aka matches that have an ELO value)
while True:
reqStatsPage = r.get(userStatsLink, params=params) #Requesting
statsPageJSON = reqStatsPage.json()
print(statsPageJSON)
for match in statsPageJSON:
elo = match.get("elo") #elo variable of the stats
if elo is not None: #checking if elo does exist (Faceit removes this parameter from matches that have no elo)
matches.append({ #adding them to our own list with the relevant data wanted
'match_id': match["matchId"],
'played_on': match["date"],
'map': match["i1"],
'elo': match["elo"]
})
totalRankedM = totalRankedM + 1 #each time we have a match that counts elo, it is a ranked match
else:
matches.append({
'match_id': match["matchId"],
'played_on': match["date"],
'map': match["i1"],
'elo': '' #just putting nothing tbh, we replace this later to "No elo recorded" but i like the freedom
})
if len(statsPageJSON) < params['size']: #check if we went thru all the pages
break
params['page'] += 1 #changing page
print(f'Total number of matches: {len(matches)}') #print the total number of matches
print(f'Total number of ranked matches: {totalRankedM}') #print total ranked matches
matches.reverse() #since we start at the top of the pages aka most recent matches and go down from there, we reverse this to make it easier for the excel sheet graph so it's in order
workbook = openpyxl.Workbook()
worksheet = workbook.active
worksheet.append(['Match ID', 'Played On', 'Map', 'ELO', 'Total Ranked Matches']) #those are the columns
for match in matches: #adding the data from the matches=[] list to the sheet
worksheet.append([match['match_id'], match['played_on'], match['map'], match['elo']])
worksheet["E2"].value = totalRankedM
worksheet["F1"].value = 'To view the game room of a specific match, simply use this link: https://www.faceit.com/en/csgo/room/REPLACE THIS WITH MATCHID/scoreboard'
for cell in worksheet['D']: #to make it pretty in the excel sheet and get rid of errors, we convert the strings that are numbers to actual numbers in the sheet (excel considers them strings, so it throws an error when its a string number)
if isinstance(cell.value, str):
try:
number = int(cell.value)
cell.data_type = 'n'
cell.value = number
except ValueError:
pass #just ignoring if we can't make it a number
for cell in worksheet['B']: #the data we are given for the dates are in unix so we just convert
if isinstance(cell.value, int):
date = datetime.datetime.fromtimestamp(cell.value / 1000)
date_str = date.strftime('%d/%m/%Y')
cell.value = date_str
for column_cells in worksheet.columns: #this is just to make each cell size fit the length of the text in it
new_column_length = max(len(str(cell.value)) for cell in column_cells)
new_column_letter = (get_column_letter(column_cells[0].column))
if new_column_length > 0:
worksheet.column_dimensions[new_column_letter].width = new_column_length*1.23
#and we save the final product xd
workbook.save('matches.xlsx')
do_the_work(myname)
#fancy stuff
print("\n")
print("\n")
print("\n")
print("You may now look at the excel sheet.")
that basically
lets you input a name
uses faceit's API calls to get the data from said account, gets the UUID (because this is the only way you can see the stats page of a user in the link),
adds that UUID to our list (i think I made it that way when trying to figure out why I was getting the error but that part still works fine) (i think lol, still had the same error before it)
then it adds that UUID to the stats link so we can parse through it
grabs the relevant data and adds it to our own list
prints that list
adds it to an excel sheet
The weird part is that the first time I run the program, it works: it loads everything correctly and adds it to the Excel sheet.
But when I run it again with the same username (it should just overwrite the file), it stops halfway through with this error:
Traceback (most recent call last):
File "C:\Users\SAM\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\models.py", line 971, in json
return complexjson.loads(self.text, **kwargs)
File "C:\Users\SAM\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Users\SAM\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\SAM\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 353, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 7 column 10 (char 107)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\SAM\Desktop\elo-scrape\test3.py", line 111, in <module>
do_the_work(myname)
File "C:\Users\SAM\Desktop\elo-scrape\test3.py", line 43, in do_the_work
statsPageJSON = reqStatsPage.json()
File "C:\Users\SAM\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\models.py", line 975, in json
raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)
requests.exceptions.JSONDecodeError: Expecting property name enclosed in double quotes: line 7 column 10 (char 107)
This specific account I am loading the data from has a total of 1605 ranked matches for context
But then I try to do it with my own account (328 ranked matches) over and over and it works over and over.
Does it just stop working when there is a huge amount of data? If so, how could I improve this code? I've spent a while trying to fix it and I'm out of ideas, which is why I'm here.
Related
I am trying to extract texts from PDF and compare the info, finally saving it as an excel file. But while I am running it, (the code is given below), I get the error. I have provided the whole Traceback.
`
import pdfminer
import pandas as pd
from time import sleep
from tqdm import tqdm
from itertools import chain
import slate
# List of pdf files to process
pdf_files = ['file1.pdf', 'file2.pdf']
# Create a list to store the text from each PDF
pdf1_text = []
pdf2_text = []
# Iterate through each pdf file
for pdf_file in tqdm(pdf_files):
# Open the pdf file
with open(pdf_file, 'rb') as pdf_now:
# Extract text using slate
text = slate.PDF(pdf_now)
text = text[0].split('\n')
if pdf_file == pdf_files[0]:
pdf1_text.append(text)
else:
pdf2_text.append(text)
sleep(20)
pdf1_text = list(chain.from_iterable(pdf1_text))
pdf2_text = list(chain.from_iterable(pdf2_text))
differences = set(pdf1_text).symmetric_difference(pdf2_text)
## Create a new dataframe to hold the differences
differences_df = pd.DataFrame(columns=['pdf1_text', 'pdf2_text'])
# Iterate through the differences and add them to the dataframe
for difference in differences:
# Create a new row in the dataframe with the difference from pdf1 and pdf2
differences_df = differences_df.append({'pdf1_text': difference if difference in pdf1_text else '',
'pdf2_text': difference if difference in pdf2_text else ''}, ignore_index=True)
# Write the dataframe to an excel sheet
differences_df = differences_df.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
differences_df.to_excel('differences.xlsx', index=False, engine='openpyxl')
import openpyxl
import re
# Load the Excel file into a dataframe
df = pd.read_excel("differences.xlsx")
# Create a condition to check the number of words in each cell
for column in ["pdf1_text", "pdf2_text"]:
df[f"{column}_word_count"] = df[column].str.split().str.len()
condition = df[f"{column}_word_count"] < 10
# Drop the rows that meet the condition
df = df[~condition]
for column in ["pdf1_text", "pdf2_text"]:
df = df.drop(f"{column}_word_count", axis=1)
# Save the modified dataframe to a new Excel file
df.to_excel("differences.xlsx", index=False)
This is my code, and below is the error which I am getting. Listing the whole traceback below -
Traceback (most recent call last):
File "c:\Users\lmohandas\stuff\1801pdfs\slatetrial.py", line 22, in <module>
text = slate.PDF(pdf_now)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\slate\classes.py", line 61, in __init__
self.doc = PDFDocument(self.parser, password)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 558, in __init__
self.read_xref_from(parser, pos, self.xrefs)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 789, in read_xref_from
xref.load(parser)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 242, in load
self.data = stream.get_data()
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdftypes.py", line 292, in get_data
self.decode()
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdftypes.py", line 283, in decode
data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\utils.py", line 46, in apply_png_predictor
raise ValueError("Unsupported predictor value: %d"%ft)
TypeError: %d format: a real number is required, not bytes
I'm really new to these things so I followed a geekforgeeks tutorial. I have some experience in python, but I couldn't figure out what the problem is.
This is the code.
# Import following modules
import urllib.request
import pandas as pd
from pushbullet import PushBullet
# Get Access Token from pushbullet.com
Access_token = "#.########################"
# Authentication
pb = PushBullet(Access_token)
# All pushes created by you
all_pushes = pb.get_pushes()
# Get the latest push
latest_one = all_pushes[0]
# Fetch the latest file URL link
url = latest_one['file_url']
# Create a new text file for storing
# all the chats
Text_file = "All_Chats.txt"
# Retrieve all the data store into
# Text file
urllib.request.urlretrieve(url, Text_file)
# Create an empty chat list
chat_list = []
# Open the Text file in read mode and
# read all the data
with open(Text_file, mode='r', encoding='utf8') as f:
# Read all the data line-by-line
data = f.readlines()
# Excluded the first item of the list
# first items contains some garbage
# data
final_data_set = data[1:]
# Run a loop and read all the data
# line-by-line
for line in final_data_set:
# Extract the date, time, name,
# message
date = line.split(",")[0]
tim = line.split("-")[0].split(",")[1]
name = line.split(":")[1].split("-")[1]
message = line.split(":")[2][:-0] ##### THIS IS THE LINE 53 #####
# Append all the data in a List
chat_list.append([date, tim, name, message])
# Create a dataframe, for storing
# all the data in a excel file
df = pd.DataFrame(chat_list,
columns = ['Date', 'Time',
'Name', 'Message'])
df.to_excel("BackUp.xlsx", index = False)
This is the error message I am getting.
Traceback (most recent call last):
File "d:\#adress to the file location", line 53, in <module>
message = line.split(":")[2][:-0]
IndexError: list index out of range
I have made a note at the line 53, so as I am just getting started, please excuse for any silly mistakes, point me out anything. I just want to figure this out.
Thanks in advance.🥲
I have a script that modifies data in a django app.I have data in an excel file that i process then update my models with it, some of the data is in Arabic and when i execute the script i get the following error:
Traceback (most recent call last):
File "script.py", line 77, in <module>
update_locations(path)
File "script.py", line 36, in update_locations
household.location = new_location
File "/data/envs/ve.maidea/lib/python2.7/site-packages/django/db/models/fields/related_descriptors.py", line 207, in __set__
self.field.remote_field.model._meta.object_name,
ValueError: Cannot assign "'\xd8\xa7\xd9\x84\xd8\xa8\xd8\xad\xd9\x8a\xd8\xb1\xd9\x87'": "Household.location" must be a "Location" instance.
I think the error is been raised by these Arabic characters.
here is my script:
import django
django.setup()
import sys
reload(sys) # to re-enable sys.setdefaultencoding()
sys.setdefaultencoding('utf-8')
import xlrd
from django.db import transaction
from foodnet.apps.registration.models import Household
from geo.models import Location
log_file = "/opt/cv_instances/cv1/autodeploy/branches/nboreports/maidea/egypt/data_import_files/egypt_beheira_locations.txt"
logfile_to_write = open(log_file, "w")
def process_file(path):
book = xlrd.open_workbook(path)
print("Got {0} number of sheets.".format(book.nsheets))
hh_counter = 0
for sheet_num in range(book.nsheets-1, -1, -1):
sheet = book.sheet_by_index(sheet_num)
print("Processing sheet number {0} ({1})".format(sheet_num, sheet.name))
for row_idx in range(1, sheet.nrows):
with transaction.atomic():
try:
household_name = str(sheet.row_values(row_idx)[0]).strip().replace(".0","")
# old_location = str(sheet.row_values(row_idx)[1]).strip().replace(".0","")
new_location = str(sheet.row_values(row_idx)[2]).strip().replace(".0","")
if household_name:
household = Household.objects.get(office__slug='eg-co',name=household_name)
# print(household.name, household.location)
#update new locations
household.location = new_location
household.save()
hh_counter += 1
logfile_to_write.write("Household {0} updated to location {1}".format(household, household.location))
except Household.DoesNotExist:
continue
print("Done looping and updating locations")
print("================================================================================================================================")
def delete_old_locations(path):
"""
Delete old locations no longer needed by the country office
"""
book = xlrd.open_workbook(path)
print("Got {0} number of sheets.".format(book.nsheets))
location_counter = 0
for sheet_num in range(book.nsheets-1, -1, -1):
sheet = book.sheet_by_index(sheet_num)
print("Processing sheet number {0} ({1})".format(sheet_num, sheet.name))
for row_idx in range(1, sheet.nrows):
with transaction.atomic():
try:
old_location = str(sheet.row_values(row_idx)[1]).strip().replace(".0","")
if old_location:
location = Location.objects.get(country__name="Egypt", name=old_location)
# print(location.name, location.country)
location.delete()
location_counter += 1
logfile_to_write.write("Location {0} deleted ".format(location))
except Location.DoesNotExist:
continue
print("Done looping and deleting locations")
print("================================================================================================================================")
#call the our process file method
if __name__=="__main__":
path = "/opt/cv_instances/cv1/autodeploy/branches/nboreports/maidea/egypt/data_import_files/egypt-sf-beheira-enrolments.xlsx"
process_file(path)
delete_old_locations(path)
print("Done processing file")
I kindly need advice on the best way of handling these Arabic characters. Thanks in advance.
This has nothing to do with Arabic characters. As the error says, you need to assign an instance of Location there, not a string.
Hello I am a very new programmer who is self teaching Python. I have encountered a very interesting problem and need some help in creating a program for it. It goes like this
A hotel salesperson enters sales in a text file. Each line contains the following, separated by semicolons: The name of the client, the service sold (such as Dinner, Conference, Lodging, and so on), the amount of the sale, and the date of that event. Write a program that reads such a file and displays the total amount for each service category. Display an error if the file does not exist or the format is incorrect.
Prompt for the name of the file to process and issue an
error message and terminate if that file can’t be opened
Verify that each line has the correct number of items and
terminate if it does not
Verify that the dollar amount is a valid floating-‐point
number and terminate if it is not
Keep a list with the categories that are encountered (they
may be different than below) and another list with the
cumulative dollar amount for each category. These are two
lists but the elements in one relate to the elements in
the other (by position)
Close the file when all the data has been processed
Display the categories and the total for each one
Our Sample text file looks something like this
Bob;Dinner;10.00;January 1, 2015
Tom;Dinner;14.00;January 2, 2015
Anne;Lodging;125.00;January 3, 2015
Jerry;Lodging;125.00;January 4, 2015
Here is what I am trying to do. I am trying to get an understanding of this and have some help from experts on Stack Overflow to solve this problem while learning. Thank you everyone!
import sys
def main():
try:
line = infile.readline()
for line in infile:
inputFileName = input("Input file name: ")
infile = open(inputFileName, "r")
fields = line.split(";")
value = float(fields[1])
except:
print("Error: The file cannot be opened.")
sys.exit(1)
infile.close()
main()
Here's a basic sketch. This is untested so likely contains typos, logic errors and such. Also, it doesn't check all of the error conditions you mentioned. However, it should be enough to get your started. The main trick is to just throw an exception where you encounter an error, and catch it where you can deal with it. That immediately stops processing the file as you wanted. The other trick is to keep a dictionary mapping category to total so you can keep a running total by category.
def main():
# Req 1.1: ask for a filename
file_name = input("Input file name: ")
try:
# To keep things simple we do all the file processing
# in a separate function. That lets us handle
# any error in the file processing with a single
# except block
amount_by_category = process_file(file_name)
# Req 6: display the categories - python will
# display the contents of a data structure when we print() it
print('Totals: ', amount_by_category)
except Exception, e:
# Reqs 1-3: display errors
print('Error processing file:', e)
def process_file(file_name):
# Req 1.2: open the file
infile = open(file_name, 'r')
# Req 4.1: somewhere to remember the categories
amount_by_catgeory = {}
# Reqs 2-4: we are dealing with a many line file
# Req 5: when we reach the end, python closes the file for us automatically
for line in infile:
# Req 2.1: each line should have 4 values separated by ;
fields = line.split(';')
# Req 2.2: does this line have 4 values?
if len(fields) != 4:
raise Exception('Expected 4 fields but found %s' % len(fields))
# Req 3: is the third value a number?
value = float(fields[2])
# Req 4.2: what category does this line belong to?
category = fields[1]
# Req 4.3.1: have we seen this category before?
if not category in amount_by_category:
# Req 4.3.2: accumulations start from 0?
amount_by_category[category] = 0.0f
# Req 4.4: increase the cumulative amount for the category
amount_by_category[category] += value
return amount_by_category
I am searching for text present in non_int_ram_var in trial.txt. non_int_ram_var takes data from an excel sheet which is basically unicode. So I type-cast it to str so that the trial.txt type will match with non_int_ram_var. But the control is not entering into if var in line:. I have cross-verified that var is present in one of the line of the text file. I might be making some mistake. Can anyone give me a suggestion?
from xlrd import open_workbook
import unicodedata
work_book= open_workbook("C:\\Users\\E542639\\Desktop\\non_sram_mem\\SEU_MBU_FINAL_VARIABLE_LIST.xls");
# reading xls file for non_int sram..............
non_int_ram_var = [] * 376
row=1
for sheet in work_book.sheets():
if "Data" == sheet.name :
print sheet.nrows, sheet.ncols
while row < sheet.nrows:
if "int_sram" != sheet.cell(row,5).value:
if "file name only" != sheet.cell(row,7).value :
non_int_ram_var.append(str(sheet.cell(row,1).value))
row=row + 1
# checking variable in mapfile.........
map_file = open("C:\\Users\\E542639\\Desktop\\non_sram_mem\\trial.txt", "r")
non_categorized = open("C:\\Users\\E542639\\Desktop\\non_sram_mem\\non_categorized.txt","w")
print "processing..."
for var in non_int_ram_var:
while True:
line = str.encode(map_file.readline())
if not line: break
if var in line:
print "control here"
non_categorized.writelines(line) # write won't work
print 'done!!!'
map_file.close()
non_categorized.close()
==================================================================
After reading to the end of the file, the cursor was not returning to the beginning on the next iteration — that was my mistake. Thanks, poke; your suggestion showed the way. This is what I did, and it now works fine:
if not line:
map_file.seek(0)
break
Python read the file into str object by default.
I think you could try
for line in map_file:
if var in line:
print "control here"
...