Why do I have different line counts? - python

I made these different programs in different programming languages to count the number of lines of a file, and it turns out that the output differs according to the program, but the strange thing is that some programs have the same results, I was testing them with a 6gb utf-8 xml file with about 146 million lines.
# Python
# Output -> 146114085 lines
import time
lines = 0
start = time.perf_counter()
with open('file_path') as myfile:
for line in myfile:
lines += 1
print("{} lines".format(lines))
end = time.perf_counter()
elapsed = end - start
print(f'Elapsed time: {elapsed:.3f} seconds')
// Java
// Output -> 146114085 lines (just as with python)
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
public class Main {
public static void main(String[] args) {
try {
long startTime = System.currentTimeMillis();
int BUFFER_SIZE = 1024*1024;
String filePath = "file_path";
FileReader file = file = new FileReader(filePath);
BufferedReader reader = new BufferedReader(file, BUFFER_SIZE);
long lines = reader.lines().count();
reader.close();
System.out.println("The number of lines is " + lines);
long elapsedTime = System.currentTimeMillis() - startTime;
System.out.println("Duration in seconds: " + elapsedTime/1000);
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
// Rust
// Output -> 146113746 lines
use std::fs::File;
use std::io::{BufRead, BufReader, Error, Read};
use std::time::Instant;
fn main() {
let file_path = "file_path";
let buffer_size = 1024*1024;
let start = Instant::now();
if let Err(err) = read_file(buffer_size, file_path) {
println!("{}", err);
}
let duration = start.elapsed();
println!("The function took {} seconds to execute", duration.as_secs());
}
fn read_file(buffer_size: usize, file_path: &str) -> Result<(), Error> {
let file = File::open(file_path)?;
let reader = BufReader::with_capacity(buffer_size, file);
let lines = reader.lines().fold(0, |sum, _| sum + 1);
println!("Number of lines {}", lines);
Ok(())
}
// C
// Output -> 146113745 lines (one line less than rust output)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char *argv[]) {
// start time
clock_t start = clock();
// File path
const char* file_path = "file_path";
// Open the file for reading
FILE *fp = fopen(file_path, "r");
// Allocate a buffer to hold the data
const size_t BUFFER_SIZE = 1024*1024;
char *buffer = malloc(BUFFER_SIZE);
// Declare the number of lines variable
unsigned int lines = 0;
// Read the data in chunks
while (!feof(fp)) {
// Read a chunk of data from the file
size_t bytes_read = fread(buffer, 1, BUFFER_SIZE, fp);
// Process the data here...
for (int i = 0; i < bytes_read; i++) {
if (buffer[i] == '\n') {
lines++;
}
}
}
printf("The number of lines %u\n", lines);
// Clean up
free(buffer);
fclose(fp);
// End
clock_t end = clock();
// Calculate the elapsed time in seconds
double elapsed = (double) ((end - start) / CLOCKS_PER_SEC);
printf("Elapsed time: %f seconds", elapsed);
return 0;
}
Finally, the command wc
Output -> 146113745 lines (just as with C)
wc -l file_path
I think the correct answer is Rust's, because it has one more than wc/C, and it is the last line that has no line change as it reaches the end of the file. The cases that cause me confusion are java and python.

My Regex expression for a line is .*?\\n|.+. This works in https://regexr.com/.
For some reason in the file reading implementation I'm using in Python and Java the character '\r' is interpreted as a line feed, but this doesn't happen with the Rust implementation, nor the wc one and obviously neither with the one I made in C (even when it is explicit).
But if I change the conditional ((buffer[i] == '\n') for ((buffer[i] == '\n') || (buffer[i] == '\r')) I get the same value as in Python and Java minus 1.

Related

Linux kernel v5.15 - how to read and write from /proc file by python

I created a kernel module and I want to communicate using a /proc file between the module and a script in python.
I am using the Ubuntu 22.04 kernel version v5.15. I tried to create the /proc file in my module below:
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kdev_t.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/slab.h> // kmalloc()
#include <linux/uaccess.h> // copy_to/from_user()
#include <linux/proc_fs.h>
static struct proc_dir_entry *parent;
char etx_array[] = "hello how are you?";
int len = sizeof(etx_array) / sizeof(char);
static int open_proc(struct inode *inode, struct file *file)
{
pr_info("proc file opend.....\t");
return 0;
}
/*
* This function will be called when we close the procfs file
*/
static int release_proc(struct inode *inode, struct file *file)
{
pr_info("proc file released.....\n");
return 0;
}
/*
* This function will be called when we read the procfs file
*/
static ssize_t read_proc(struct file *filp, char __user *buffer, size_t length, loff_t * offset)
{
pr_info("proc file read.....\n");
if (copy_to_user(buffer, etx_array, len))
pr_err("Data Send : Err!\n");
return length;;
}
/*
* This function will be called when we write the procfs file
*/
static ssize_t write_proc(struct file *filp, const char *buff, size_t len, loff_t * off)
{
pr_info("proc file wrote.....\n");
if (copy_from_user(etx_array, buff, len))
pr_err("Data Write : Err!\n");
return len;
}
static struct proc_ops proc_fops = {
.proc_open = open_proc,
.proc_read = read_proc,
.proc_write = write_proc,
.proc_release = release_proc,
};
static int etx_driver_init(void)
{
proc_create("etx_proc", 0666, parent, &proc_fops);
return 0;
}
static void etx_driver_exit(void)
{
proc_remove(parent);
}
module_init(etx_driver_init);
module_exit(etx_driver_exit);
MODULE_LICENSE("GPL");
And if I try to use /proc file by python3 in user space like this:
import os
pf = open("/proc/etx_proc","r")
print(pf.read())
So I get this (running the python program):
The problem here is that pf.read() in Python reads the entire file. How does the kernel know when the content of your proc file is over? When you return 0 from your .proc_read function. Since you are always copying etx_array out and you are always returning length, this means that to any application that tries to read the file (and also to the kernel itself) it will look like the file has infinite length. Python will keep reading and reading until it eventually runs out of memory and gets killed by the kernel OOM killer (or until your system freezes).
Always returning the requested size (length) is wrong as it does not match the actual length of the data copied to userspace. You have to check the requested size, copy no more than that and also no more than your buffer size (len), then return a meaningful value. You can keep track of how much you read with the loff_t * pointer that gets passed to your functions.
The same reasoning goes for your .proc_write function: it doesn't make sense to always return len. Check the requested size, cap it as needed, then perform the operation and return a meaningful value.
Copying data to/from userspace is a lot trickier than you might think, and there are a lot of pitfalls, you need to check everything that could possibly go wrong, specially when dealing with sizes. Here's an example of how this would work:
// Side note: this is the correct "len" you want.
size_t len = sizeof(etx_array) / sizeof(char) - 1;
// ...
static ssize_t read_proc(struct file *filp, char __user *buf, size_t size, loff_t *off)
{
loff_t offset = *off;
size_t remaining;
pr_info("proc file read\n");
if (offset < 0)
return -EINVAL;
if (offset >= len || size == 0)
return 0;
if (size > len - offset)
size = len - offset;
remaining = copy_to_user(buf, etx_array + offset, size);
if (remaining == size) {
pr_err("copy_to_user failed\n");
return -EFAULT;
}
size -= remaining;
*off = offset + size;
return size;
}
static ssize_t write_proc(struct file *filp, const char *buf, size_t size, loff_t *off)
{
loff_t offset = *off;
size_t remaining;
pr_info("proc file write\n");
if (offset < 0)
return -EINVAL;
if (offset >= len || size == 0)
return 0;
if (size > len - offset)
size = len - offset;
remaining = copy_from_user(etx_array + offset, buf, size);
if (remaining == size) {
pr_err("copy_from_user failed\n");
return -EFAULT;
}
size -= remaining;
*off = offset + size;
return size;
}
Check out this other answer of mine talking about simple_read_from_buffer() / simple_write_to_buffer(), which are simpler kernel interfaces that accomplish exactly the same as the above code for you automatically and (most importantly) correctly.

Issue packing / unpacking message for UART between ESP-01S and Raspberry Pi Pico

My end goal is to set up a Raspberry Pi Pico with an ESP-01S to enable wifi. The Pico will periodically check in with the server and put the ESP to sleep when not in use. Communication between the two is over UART. I want to support both a GET and POST HTTP request from the ESP.
This is the message structure I am using between the two devices.
| Always start with MSG | Request Type | Next message Size
| | Next message size | | | URL
V V V V V
|-----|-------------------|-----|-------------------|-----------|
[M|S|G|\x00\|x00\|x00\|x03|G|E|T|\x00|\x00|\x00|\x1f|h|t|t|p|...]
[M|S|G|\x00\|x00\|x00\|x04|P|O|S|T|URL SIZE...|URL...|\x00|\x00|\x006|POST Data...]
|-----|-------------------|-------|-----------|------|---------------|------------|
^ ^
| | Post Data
| Post data size
For testing purposes I am generating the strings in python, printing them and pasting them directly in the .cpp file I'm flashing onto the ESP.
Here is the snippet of code I'm using on my pc to generate the message.
import struct
import json
url = "http://192.168.X.X:8090/korok"
size_of_url = struct.pack('!I', len(url))
data = json.dumps({
"serial": "12345",
"sensor_data": {"0": 75, "1": 67}
})
size_of_data = struct.pack('!I', len(data))
print(f"{struct.pack('!I', len('GET'))}GET{size_of_url}{url}")
print(f"{struct.pack('!I', len('POST'))}POST{size_of_url}{url}{size_of_data}{data}")
>>>> ...
b'\x00\x00\x00\x03'GETb'\x00\x00\x00\x1f'http://192.168.X.X:8090/korok
b'\x00\x00\x00\x03'GETb'\x00\x00\x00\x1f'http://192.168.X.X:8090/korokb'\x00\x00\x006'{"serial": "12345", "sensor_data": {"0": 75, "1": 67}}
And here is the code running on the ESP. I left comments on some of the behaviors I'm experiencing.
#include <ESP8266WiFi.h>
#include <ESP8266HTTPClient.h>
#define GPIO_STATUS 2
#define BUFFER_SIZE 256
#define DATA_SIZE 4
char buf[BUFFER_SIZE];
void setup() {...}
...
// Struct is used so I can also get the size to use as an offset
// and read out part of the message in the buffer.
struct Message
{
char *value;
unsigned int size;
unsigned int totalMessageSize;
};
Message readMessage(char *data)
{
struct Message message;
message.size = ntohl(*(unsigned int *)(data)); // Unable to read shorthand hex
Serial.println(message.size);
message.totalMessageSize = message.size + DATA_SIZE; // Shorthand is only 3 char
message.value = (char *)malloc(message.size);
Serial.println(message.size);
int idx = DATA_SIZE;
int jdx = 0;
while (idx < message.size + DATA_SIZE && idx < BUFFER_SIZE)
{
message.value[jdx++] = data[idx++];
}
return message;
}
void loop()
{
delay(3000);
char * msg = "\x00\x00\x00\x03GET\x00\x00\x00\x1fhttp://192.168.X.X:8090/korok";
// char *msg = read_message();
if (msg)
{
Serial.print("\n");
int bufferIdx = 0;
struct Message request = readMessage(msg);
bufferIdx = request.totalMessageSize;
// This is odd and doing it due to odd behavior with ntohl and a variable
// offset. See below.
memcpy(msg, msg + bufferIdx, BUFFER_SIZE - bufferIdx);
struct Message url = readMessage(msg);
struct Message data;
if (memcmp(request.value, "GET", 3) == 0)
{
}
else if (memcmp(request.value, "POST", 4) == 0)
{
bufferIdx = url.totalMessageSize;
memcpy(msg, msg + bufferIdx, BUFFER_SIZE - bufferIdx);
struct Message data = readMessage(msg);
Serial.println(data.value);
}
free(request.value);
free(url.value);
if (memcmp(request.value, "POST", 4) == 0)
{
free(data.value);
}
}
}
Here is one of the two issues, though I found a workaround by doing a memcpy of my original char* offsetting the beginning index. The first line below works but the second throws LoadStoreAlignmentCause exception.
Ideally I'd like to understand what's going on here and to get this working without the memcpy.
ntohl(*(unsigned int *)(msg + 7)); // Works
int offset = 7;
ntohl(*(unsigned int *)(msg + offset)); // Throws Exception (9) LoadStoreAlignmentCause
The main issue I'm experiencing is when I'm packing the size in python some hex values are being shorthanded. e.g struct.pack('!I', 54) == \x00\x00\x006
When this happens ntohl() seems to read an address it shouldn't and outputs 635. Also since I'm expecting four char the rest of the message is off by one index.
A few questions regarding this issue. What is the name of this shorthand hex syntax? Is there anyway to get python to not output this short hand? Or are there any suggestions on how to get this working on the ESP?

C char array from python string

I have a list of strings in python which I'm trying to pass down to a C extension for character analysis. I've gotten so far as to have the list broken up into their individual string PyObjects. Next, I'm hoping to split these strings into their individual characters so that every string PyObject is now a corresponding C-type character array. I can't seem to figure out how to do this though.
Here's what I have so far: Currently after building the .pyd file it will return a list of 1's as a filler to Python (so everything else works), I just don't know how to split a string PyObject into the C-type character array.
--- cExt.c ---
#include <Python.h>
#include <stdio.h>
static int *CitemCheck(PyObject *commandString, int commandStringLength) {
// HAALP
//char* commandChars = (char*) malloc(commandStringLength*sizeof(char*));
// char c[] = PyString_AsString("c", commandString);
// printf("%c" , c);
// printf("%s", PyString_AsString(commandString));
// for (int i=0; i<sizeof(commandChars)/sizeof(*commandChars); i++) {
// printf("%s", PyString_AsString(commandString));
// printf("%c", commandChars[i]);
// }
return 1; // TODO: RETURN PROPER RESULTANT
}
static PyObject *ClistCheck(PyObject *commandList, int commandListLength) {
PyObject *results = PyList_New(commandListLength);
for (int index = 0; index < commandListLength; index++) {
PyObject *commandString;
commandString = PyList_GetItem(commandList, index);
int commandStringLength = PyObject_Length(commandString);
// CitemCheck should take string PyObject and its length as int
int x = CitemCheck(commandString, commandStringLength);
PyObject* pyItem = Py_BuildValue("i", x);
PyList_SetItem(results, index, pyItem);
}
return results;
}
static PyObject *parseListCheck(PyObject *self, PyObject *args) {
PyObject *commandList;
int commandListLength;
if (!PyArg_ParseTuple(args, "O", &commandList)){
return NULL;
}
commandListLength = PyObject_Length(commandList);
return Py_BuildValue("O", ClistCheck(commandList, commandListLength));
}
static char listCheckDocs[] =
""; // TODO: ADD DOCSTRING
static PyMethodDef listCheck[] = {
{"listCheck", (PyCFunction) parseListCheck, METH_VARARGS, listCheckDocs},
{NULL,NULL,0,NULL}
};
static struct PyModuleDef DCE = {
PyModuleDef_HEAD_INIT,
"listCheck",
NULL,
-1,
listCheck
};
PyMODINIT_FUNC PyInit_cExt(void){
return PyModule_Create(&DCE);
}
for reference, my temporary extension build file:
--- _c_setup.py ---
(located in same folder as cExt.c)
"""
to build C files, pass:
python _c_setup.py build_ext --inplace clean --all
in command prompt which is cd'd to the file's dierctory
"""
import glob
from setuptools import setup, Extension, find_packages
from os import path
here = path.abspath(path.dirname(__file__))
files = [path.split(x)[1] for x in glob.glob(path.join(here, '**.c'))]
extensions = [Extension(
path.splitext(x)[0], [x]
) for x in files]
setup(
ext_modules = extensions,
)
You can use PyUnicode_AsEncodedString, which
Encode a Unicode object and return the result as Python bytes object. encoding and errors have the same meaning as the parameters of the same name in the Unicode encode() method. The codec to be used is looked up using the Python codec registry. Return NULL if an exception was raised by the codec.
see https://docs.python.org/3/c-api/unicode.html#c.PyUnicode_AsEncodedString
Then with PyBytes_AsString you get a pointer to internal buffer with a terminating NUL byte. This buffer must neither be deallocated nor modified. If you need a copy you could use e.g. strdup.
see https://docs.python.org/3/c-api/bytes.html#c.PyBytes_AsString
Slightly modifying your code it could look like this:
PyObject *encodedString = PyUnicode_AsEncodedString(commandString, "UTF-8", "strict");
if (encodedString) { //returns NULL if an exception was raised
char *commandChars = PyBytes_AsString(encodedString); //pointer refers to the internal buffer of encodedString
if(commandChars) {
printf("the string '%s' consists of the following chars:\n", commandChars);
for (int i = 0; commandChars[i] != '\0'; i++) {
printf("%c ", commandChars[i]);
}
printf("\n");
}
Py_DECREF(encodedString);
}
If one would test with:
import cExt
fruits = ["apple", "pears", "cherry", "pear", "blueberry", "strawberry"]
res = cExt.listCheck(fruits)
print(res)
The output would be:
the string 'apple' consists of the following chars:
a p p l e
the string 'pears' consists of the following chars:
p e a r s
the string 'cherry' consists of the following chars:
c h e r r y
the string 'pear' consists of the following chars:
p e a r
the string 'blueberry' consists of the following chars:
b l u e b e r r y
the string 'strawberry' consists of the following chars:
s t r a w b e r r y
[1, 1, 1, 1, 1, 1]
Side note not directly related to the question:
Your CitemCheck function returns a pointer to int, but if looking at how it is called, it seems that you want to return an int value. The function signature should look more like this:
static int CitemCheck(PyObject *commandString, int commandStringLength)
(note the removed * after int).

Converting comment // with /* with Python re

In my C program, I should convert every "// comment" with "/* comment */"
I thought of the code below with Python re, but could't come up with any idea of how to
insert */ before at the end of the line.
regexp = re.compile(r'//', re.MULTILINE)
c = regexp.sub('/*',c)
Any idea will be grateful. Thank you.
Here's a start for a (IMO) "better" solution. It uses regex to scan through the input and match not only single line comments, but also those tokens that can contain //:
#!/usr/bin/env python
import re
import sys
source = """
/*
* not
// a single line
// comment
*/
// include stuff
#include<stdio.h>
main()
{
// comments with a '*/' !
printf("Again, no single // line comment"); // comment
}
"""
pattern = re.compile(r""" (//([^\r\n]*)) # match a single line comment
| "[^"]*" # match a string literal
| /\*.*?\*/ # match a multi line comment
| . # fall through: match any char
"""
, re.X | re.S)
print("%s" % source)
print('=' * 40)
for match in pattern.finditer(source):
if match.group(1):
# we found a single line comment
s = match.group(2).replace('*/', '* /')
sys.stdout.write("/*" + s + " */")
else:
# something other than a single line comment, just print it
sys.stdout.write(match.group())
which will print:
/*
* not
// a single line
// comment
*/
// include stuff
#include<stdio.h>
main()
{
// comments with a '*/' !
printf("Again, no single // line comment"); // comment
}
========================================
/*
* not
// a single line
// comment
*/
/* include stuff */
#include<stdio.h>
main()
{
/* comments with a '* /' ! */
printf("Again, no single // line comment"); /* comment */
}
Adapted from a previous post that was looking for single line comments in Java, you can use pyparsing:
data = """
class HelloWorld {
// method main(): ALWAYS the APPLICATION entry point
public static void main (String[] args) {
System.out.println("Hello World!"); // Nested //Print 'Hello World!'
System.out.println("http://www.example.com"); // Another nested // Print a URL
System.out.println("\"http://www.example.com"); // A nested escaped quote // Print another URL
}
}"""
from pyparsing import *
dbls = QuotedString('"', '\\', '"')
sgls = QuotedString("'", '\\', "'")
strings = dbls | sgls
g = dblSlashComment.ignore(strings)
g.setParseAction(lambda L,: '/*' + L[0].lstrip('//') + '*/')
print g.transformString(data)
Outputs:
class HelloWorld {
/* method main(): ALWAYS the APPLICATION entry point*/
public static void main (String[] args) {
System.out.println("Hello World!"); /* Nested //Print 'Hello World!'*/
System.out.println("http://www.example.com"); /* Another nested // Print a URL*/
System.out.println(""http://www.example.com"); /* A nested escaped quote // Print another URL*/
}
}

Brute forcing DES with a weak key

I am taking a course on Cryptography and am stuck on an assignment. The instructions are as follows:
The plaintext plain6.txt has been encrypted with DES to encrypt6.dat using a 64-bit key given as a string of 8 characters (64
bits of which every 8th bit is ignored), all characters being letters
(lower-case or upper-case) and digits (0 to 9).
To complete the assignment, send me the encryption key before February
12, 23.59.
Note: I expect to get an 8-byte (64-bits) key. Each byte should
coincide with the corresponding byte in my key, except for the least
significant bit which is not used in DES and thus, could be arbitrary.
Here is the code to my first attempt in Python:
import time
from Crypto.Cipher import DES
class BreakDES(object):
def __init__(self, file, passwordLength = 8, testLength = 8):
self.file = file
self.passwordLength = passwordLength
self.testLength = testLength
self.EncryptedFile = open(file + '.des')
self.DecryptedFile = open(file + '.txt')
self.encryptedChunk = self.EncryptedFile.read(self.testLength)
self.decryptedChunk = self.DecryptedFile.read(self.testLength)
self.start = time.time()
self.counter = 0
self.chars = range(48, 58) + range(65, 91) + range(97, 123)
self.key = False
self.broken = False
self.testPasswords(passwordLength, 0, '')
if not self.broken:
print "Password not found."
duration = time.time() - self.start
print "Brute force took %.2f" % duration
print "Tested %.2f per second" % (self.counter / duration)
def decrypt(self):
des = DES.new(self.key.decode('hex'))
if des.decrypt(self.encryptedChunk) == self.decryptedChunk:
self.broken = True
print "Password found: 0x%s" % self.key
self.counter += 1
def testPasswords(self, width, position, baseString):
for char in self.chars:
if(not self.broken):
if position < width:
self.testPasswords(width, position + 1, baseString + "%c" % char)
self.key = (baseString + "%c" % char).encode('hex').zfill(16)
self.decrypt()
# run it for password length 4
BreakDES("test3", 4)
I am getting a speed of 60.000 tries / second. A password of 8 bytes over 62 characters gives 13 trillion possibilities, which means that at this speed it would take me 130 years to solve. I know that this is not an efficient implementation and that I could get better speeds in a faster language like C or it's flavors, but I have never programmed in those. Even if I get a speed-up of 10, we're still a huge leap away from 10,000,000,000 per second to get in the hours range.
What am I missing? This is supposed to be a weak key :). Well, weaker than the full 256 character set.
EDIT
Due to some ambiguity about the assignment, here is the full description and some test files for calibration: http://users.abo.fi/ipetre/crypto/assignment6.html
EDIT 2
This is a crude C implementation that gets around 2.000.000 passwords/s per core on an i7 2600K. You have to specify the first character of the password and can manually run multiple instances on different cores/computers. I managed to solve the problem using this within a couple of hours on four computers.
#include <stdio.h> /* fprintf */
#include <stdlib.h> /* malloc, free, exit */
#include <unistd.h>
#include <string.h> /* strerror */
#include <signal.h>
#include <openssl/des.h>
static long long unsigned nrkeys = 0; // performance counter
char *
Encrypt( char *Key, char *Msg, int size)
{
static char* Res;
free(Res);
int n=0;
DES_cblock Key2;
DES_key_schedule schedule;
Res = ( char * ) malloc( size );
/* Prepare the key for use with DES_ecb_encrypt */
memcpy( Key2, Key,8);
DES_set_odd_parity( &Key2 );
DES_set_key_checked( &Key2, &schedule );
/* Encryption occurs here */
DES_ecb_encrypt( ( unsigned char (*) [8] ) Msg, ( unsigned char (*) [8] ) Res,
&schedule, DES_ENCRYPT );
return (Res);
}
char *
Decrypt( char *Key, char *Msg, int size)
{
static char* Res;
free(Res);
int n=0;
DES_cblock Key2;
DES_key_schedule schedule;
Res = ( char * ) malloc( size );
/* Prepare the key for use with DES_ecb_encrypt */
memcpy( Key2, Key,8);
DES_set_odd_parity( &Key2 );
DES_set_key_checked( &Key2, &schedule );
/* Decryption occurs here */
DES_ecb_encrypt( ( unsigned char (*) [8]) Msg, ( unsigned char (*) [8]) Res,
&schedule, DES_DECRYPT );
return (Res);
}
void ex_program(int sig);
int main(int argc, char *argv[])
{
(void) signal(SIGINT, ex_program);
if ( argc != 4 ) /* argc should be 2 for correct execution */
{
printf( "Usage: %s ciphertext plaintext keyspace \n", argv[0] );
exit(1);
}
FILE *f, *g;
int counter, i, prime = 0, len = 8;
char cbuff[8], mbuff[8];
char letters[] = "02468ACEGIKMOQSUWYacegikmoqsuwy";
int nbletters = sizeof(letters)-1;
int entry[len];
char *password, *decrypted, *plain;
if(atoi(argv[3]) > nbletters-2) {
printf("The range must be between 0-%d\n", nbletters-2);
exit(1);
}
prime = atoi(argv[1])
// read first 8 bytes of the encrypted file
f = fopen(argv[1], "rb");
if(!f) {
printf("Unable to open the file\n");
return 1;
}
for (counter = 0; counter < 8; counter ++) cbuff[counter] = fgetc(f);
fclose(f);
// read first 8 bytes of the plaintext file
g = fopen(argv[2], "r");
if(!f) {
printf("Unable to open the file\n");
return 1;
}
for (counter = 0; counter < 8; counter ++) mbuff[counter] = fgetc(g);
fclose(g);
plain = malloc(8);
memcpy(plain, mbuff, 8);
// fill the keys
for(i=0 ; i<len ; i++) entry[i] = 0;
entry[len-1] = prime;
// loop until the length is reached
do {
password = malloc(8);
decrypted = malloc(8);
// build the pasword
for(i=0 ; i<len ; i++) password[i] = letters[entry[i]];
nrkeys++;
// end of range and notices
if(nrkeys % 10000000 == 0) {
printf("Current key: %s\n", password);
printf("End of range ");
for(i=0; i<len; i++) putchar(letters[lastKey[i]]);
putchar('\n');
}
// decrypt
memcpy(decrypted,Decrypt(password,cbuff,8), 8);
// compare the decrypted with the mbuff
// if they are equal, exit the loop, we have the password
if (strcmp(mbuff, decrypted) == 0)
{
printf("We've got it! The key is: %s\n", password);
printf("%lld keys searched\n", nrkeys);
exit(0);
}
free(password);
free(decrypted);
// spin up key until it overflows
for(i=0 ; i<len && ++entry[i] == nbletters; i++) entry[i] = 0;
} while(i<len);
return 0;
}
void ex_program(int sig) {
printf("\n\nProgram terminated %lld keys searched.\n", nrkeys);
(void) signal(SIGINT, SIG_DFL);
exit(0);
}
I would assume the desired solution is to actually implement the algorithmn. Then, since your're decrypting yourself, you can bail early, which, assuming the plain text is also A-Za-z0-9, gives you a 98% chance of being able to stop after decrypting a single byte, a 99.97% chance of stoping after decrypting 2 bytes, and a 99.9995% chance of stopping after 3 bytes.
Also, use C or Ocaml or something like that. You're probably spending MUCH more time doing string manipulation than you are doing cryption. Or, at least use multi-processing and spin up all your cores...
There is an obvious factor 256 speedup: One bit per byte isn't part of the key. DES has only a 56 bit key, but one passes in 64 bits. Figure out which bit it is, and throw away equivalent characters.
I've had quite a bit of help and this is a solution in C. As I am a C beginner, it's probably full of bugs and bad practice, but it works.
As CodeInChaos figured out, only 31 characters need to be tested, because DES ignores every 8th bit of the key, making for example ASCII characters b: *0110001*0 and c: *0110001*1 identical in encryption/decryption when used as a part of the key.
I am using the OpenSSL library for DES decryption. On my machine the speed it achieves is ~1.8 million passwords per second, which puts the total time to test the entire key space to around 5 days. This falls a day short of the deadline. A lot better than the Python code above which is in the years territory.
There is still room for improvement, probably the code could be optimized and threaded. If I could use all my cores I estimate the time would go down to a bit more than a day, however I have no experience with threading yet.
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <openssl/des.h>
static long long unsigned nrkeys = 0; // performance counter
char *
Encrypt( char *Key, char *Msg, int size)
{
static char* Res;
free(Res);
int n=0;
DES_cblock Key2;
DES_key_schedule schedule;
Res = ( char * ) malloc( size );
/* Prepare the key for use with DES_ecb_encrypt */
memcpy( Key2, Key,8);
DES_set_odd_parity( &Key2 );
DES_set_key_checked( &Key2, &schedule );
/* Encryption occurs here */
DES_ecb_encrypt( ( unsigned char (*) [8] ) Msg, ( unsigned char (*) [8] ) Res,
&schedule, DES_ENCRYPT );
return (Res);
}
char *
Decrypt( char *Key, char *Msg, int size)
{
static char* Res;
free(Res);
int n=0;
DES_cblock Key2;
DES_key_schedule schedule;
Res = ( char * ) malloc( size );
/* Prepare the key for use with DES_ecb_encrypt */
memcpy( Key2, Key,8);
DES_set_odd_parity( &Key2 );
DES_set_key_checked( &Key2, &schedule );
/* Decryption occurs here */
DES_ecb_encrypt( ( unsigned char (*) [8]) Msg, ( unsigned char (*) [8]) Res,
&schedule, DES_DECRYPT );
return (Res);
}
void ex_program(int sig);
int main()
{
(void) signal(SIGINT, ex_program);
FILE *f, *g; // file handlers
int counter, i, len = 8; // counters and password length
char cbuff[8], mbuff[8]; // buffers
char letters[] = "02468ACEGIKMOQSUWYacegikmoqsuwy"; // reduced letter pool for password brute force
int nbletters = sizeof(letters)-1;
int entry[len];
char *password, *decrypted;
// read first 8 bytes of the encrypted file
f = fopen("test2.dat", "rb");
if(!f) {
printf("Unable to open the file\n");
return 1;
}
for (counter = 0; counter < 8; counter ++) cbuff[counter] = fgetc(f);
fclose(f);
// read first 8 bytes of the plaintext file
g = fopen("test2.txt", "r");
if(!f) {
printf("Unable to open the file\n");
return 1;
}
for (counter = 0; counter < 8; counter ++) mbuff[counter] = fgetc(g);
fclose(g);
// fill the initial key
for(i=0 ; i<len ; i++) entry[i] = 0;
// loop until the length is reached
do {
password = malloc(8);
decrypted = malloc(8);
// build the pasword
for(i=0 ; i<len ; i++) password[i] = letters[entry[i]];
nrkeys++;
if(nrkeys % 10000000 == 0) {
printf("Current key: %s\n", password);
}
// decrypt
memcpy(decrypted,Decrypt(password,cbuff,8), 8);
// compare the decrypted with the mbuff
// if they are equal, exit the loop, we have the password
if (strcmp(mbuff, decrypted) == 0)
{
printf("We've got it! The key is: %s\n", password);
printf("%lld keys searched", nrkeys);
exit(0);
}
free(password);
free(decrypted);
// spin up key until it overflows
for(i=0 ; i<len && ++entry[i] == nbletters; i++) entry[i] = 0;
} while(i<len);
return 0;
}
void ex_program(int sig) {
printf("\n\nProgram terminated %lld keys searched.\n", nrkeys);
(void) signal(SIGINT, SIG_DFL);
exit(0);
}
I can't help but notice the wording of the assignment: you are not actually requested to provide a DES implementation or cracker yourself. If that is indeed the case, why don't you take a look at tools such as John the Ripper or hashcat.
This answer may be complementary to other more specific suggestions but the first thing you should do is run a profiler.
There are really nice examples here:
How can you profile a python script?
EDIT:
For this particular task, I realize it will not help. A trial frequency of 10 GHz is... Hard on a single machine with frequency less than that. Perhaps you could mention what hardware you have available.
Also, don't aim for running it during a few hours. When you find a method that gives a reasonable probability of success within the week you have then let it run while improving your methods.

Categories