Initial (and hopefully last) commit
This commit is contained in:
commit
06374b2608
8 changed files with 326 additions and 0 deletions
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
# Generated data sets and exported documents
*.npy
*.odt
*.pdf
# Generated letter images and the local virtualenv
/out/
/.venv/
|
||||||
40
Buchstaben_extrahieren.py
Normal file
40
Buchstaben_extrahieren.py
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
"""Cut a photographed grid of block letters into individual letter images.

Reads ``Blockbuchstaben.jpeg``, converts it to grayscale, slices it into a
``number_rows`` x ``number_cols`` grid and writes one PNG per cell into
``output_folder``.
"""
import cv2
import numpy as np
import os

# --- User parameters ---
output_folder = "letters"
image = cv2.imread("Blockbuchstaben.jpeg")
number_rows = 13
number_cols = 8
margin = 15  # border trimmed from each individual letter cell, in pixels

# BUGFIX: cv2.imread returns None on failure; without this check the
# image.shape access below raises a cryptic AttributeError.
if image is None:
    raise FileNotFoundError("Could not read Blockbuchstaben.jpeg")

gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

height, width, channels = image.shape
print(height)
print(width)

# Size of one grid cell (integer division; a few edge pixels may be dropped).
letter_width = width // number_cols
letter_height = height // number_rows
print(letter_width)
print(letter_height)

os.makedirs(output_folder, exist_ok=True)

for i in range(number_rows):
    for j in range(number_cols):
        x = j * letter_width
        y = i * letter_height
        # Crop the cell and trim `margin` pixels on every side so grid
        # lines and neighbouring letters don't bleed into the crop.
        letter_img = gray[y + margin:y + letter_height - margin,
                          x + margin:x + letter_width - margin]
        cv2.imwrite(output_folder + '/letter' + str(i) + str(j) + '.png', letter_img)

cv2.waitKey(0)
cv2.destroyAllWindows()
|
||||||
49
Buchstaben_extrahieren2.py
Normal file
49
Buchstaben_extrahieren2.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
def save_letter_images_from_pdf(pdf_path, output_folder):
    """Render every text span of a PDF as a 28x28 grayscale PNG.

    pdf_path: path of the PDF to read.
    output_folder: existing directory the letter images are written into.
    """
    doc = fitz.open(pdf_path)

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)

        # Text structure: blocks -> lines -> spans; each span has a bbox.
        text_boxes = page.get_text("dict")["blocks"]

        # BUGFIX: the file name previously used len(text_boxes), which is
        # constant for the whole page, so every span overwrote the
        # previous image. Use a running per-page counter instead.
        box_idx = 0
        for block in text_boxes:
            if "lines" in block:  # image blocks carry no "lines" key
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Bounding box of this span of text.
                        rect = fitz.Rect(span["bbox"])

                        # Render only the span's bounding box.
                        pix = page.get_pixmap(clip=rect)

                        # BUGFIX: np.array(bytes) yields a 0-d bytes array
                        # that cannot be reshaped; np.frombuffer decodes
                        # the raw sample buffer into uint8 pixels.
                        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                            pix.height, pix.width, pix.n)

                        # Collapse to a single grayscale channel.
                        # NOTE(review): PyMuPDF pixmaps are RGB(A), not
                        # BGR(A); for a grayscale conversion the weighting
                        # difference is minor, conversion kept as before.
                        if pix.n == 4:
                            img = cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
                        else:
                            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                        # Resize to the 28x28 input size used downstream.
                        img_resized = cv2.resize(img, (28, 28),
                                                 interpolation=cv2.INTER_AREA)

                        img_pil = Image.fromarray(img_resized)
                        img_pil.save(
                            f"{output_folder}/letter_page{page_num + 1}_box{box_idx}.png")
                        box_idx += 1

    # Close the document explicitly to release the file handle.
    doc.close()
|
||||||
|
|
||||||
|
# Example usage with hard-coded local paths.
# NOTE(review): output_folder must already exist — the function does not
# create it before saving images there; confirm against the environment.
pdf_path = "/home/mia/Schule/KISY/schrifterkennung/letters.pdf"
output_folder = "/home/mia/Schule/KISY/schrifterkennung/out/"
save_letter_images_from_pdf(pdf_path, output_folder)
|
||||||
|
|
||||||
113
app.py
Normal file
113
app.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.keras import layers, models, Sequential
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import os
|
||||||
|
from PyQt6.QtWidgets import QApplication, QMainWindow, QWidget, QVBoxLayout, QPushButton, QLabel
|
||||||
|
from PyQt6.QtGui import QPainter, QPen, QImage
|
||||||
|
from PyQt6.QtCore import Qt, QPoint
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Project base directory.
# NOTE(review): defined but never used below — kept for reference.
data_dir = "/home/mia/Schule/KISY/schrifterkennung/" # Ignore full path, had some weird problem otherwise

# File the trained model is loaded from (written by make_model.py).
model_file = "model.keras"

print("We have done training already so we load this to not waste very precious cpu :)")
# Load the pre-trained Keras model once at startup instead of retraining.
model = tf.keras.models.load_model(model_file)

# (A commented-out validation-preview loop referencing an undefined
# `val_ds` was removed here.)

#### DISCLAIMER: This was written by AI; I hate GUI stuff ####
|
||||||
|
|
||||||
|
class DrawingCanvas(QWidget):
    """Fixed-size widget the user draws a letter on with the left mouse button.

    The drawing is kept in an off-screen grayscale QImage so it can later be
    downscaled and fed to the model.
    """

    def __init__(self):
        super().__init__()
        self.setFixedSize(320, 320)  # 10x the model input size for easier drawing
        self.image = QImage(self.size(), QImage.Format.Format_Grayscale8)
        self.image.fill(Qt.GlobalColor.white)
        self.drawing = False
        self.last_point = QPoint()

    def paintEvent(self, event):
        # Blit the off-screen image onto the visible widget.
        painter = QPainter(self)
        painter.drawImage(0, 0, self.image)

    def mousePressEvent(self, event):
        if event.button() == Qt.MouseButton.LeftButton:
            self.drawing = True
            self.last_point = event.position().toPoint()

    def mouseMoveEvent(self, event):
        if (event.buttons() & Qt.MouseButton.LeftButton) and self.drawing:
            painter = QPainter(self.image)
            # Thick BLACK pen on the white canvas, matching the grayscale
            # training images (the original comment wrongly said "white").
            painter.setPen(QPen(Qt.GlobalColor.black, 18, Qt.PenStyle.SolidLine, Qt.PenCapStyle.RoundCap))
            painter.drawLine(self.last_point, event.position().toPoint())
            self.last_point = event.position().toPoint()
            self.update()

    def mouseReleaseEvent(self, event):
        # BUGFIX: self.drawing was previously never reset after release.
        if event.button() == Qt.MouseButton.LeftButton:
            self.drawing = False

    def clear(self):
        """Reset the canvas to an all-white image."""
        self.image.fill(Qt.GlobalColor.white)
        self.update()
|
||||||
|
|
||||||
|
|
||||||
|
class MainWindow(QMainWindow):
    """Main window: drawing canvas plus Predict / Clear buttons and a result label."""

    def __init__(self):
        super().__init__()
        self.setWindowTitle("Handwriting Recognition")

        main_layout = QVBoxLayout()
        self.canvas = DrawingCanvas()
        # BUGFIX: the closing parenthesis was missing in the hint text.
        self.result_label = QLabel("Draw something and click Predict. (Need to fill entire space or model has stroke)")
        self.result_label.setAlignment(Qt.AlignmentFlag.AlignCenter)

        predict_btn = QPushButton("Predict")
        predict_btn.clicked.connect(self.predict_image)

        clear_btn = QPushButton("Clear Canvas")
        clear_btn.clicked.connect(self.canvas.clear)

        main_layout.addWidget(self.canvas)
        main_layout.addWidget(self.result_label)
        main_layout.addWidget(predict_btn)
        main_layout.addWidget(clear_btn)

        container = QWidget()
        container.setLayout(main_layout)
        self.setCentralWidget(container)

    def predict_image(self):
        """Downscale the canvas to 32x32, run the model, display the result."""
        # 1. Resize the drawing to 32x32 to match the model input.
        scaled_img = self.canvas.image.scaled(32, 32, Qt.AspectRatioMode.IgnoreAspectRatio,
                                              Qt.TransformationMode.SmoothTransformation)

        # 2. Convert the QImage into a (32, 32, 1) uint8 numpy array.
        # (32-byte Grayscale8 scanlines are already 4-byte aligned, so the
        # buffer is exactly 32*32 bytes with no row padding.)
        ptr = scaled_img.bits()
        ptr.setsize(32 * 32)
        arr = np.frombuffer(ptr, np.uint8).reshape(32, 32, 1)

        # 3. Add the batch dimension and predict.
        # NOTE(review): the original comment claimed the model has a
        # built-in Rescaling layer, but make_model.py defines none — the
        # model appears to be trained on raw 0-255 values, which matches
        # feeding unscaled pixels here. Confirm against the saved model.
        img_batch = np.expand_dims(arr, axis=0)
        prediction = model.predict(img_batch, verbose=0)
        print(type(prediction), prediction)

        # Map the argmax class id back to a letter (assumes the 26 classes
        # were sorted alphabetically during training).
        class_names = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        print([x for x in zip(class_names, prediction[0])])
        result = class_names[np.argmax(prediction)]
        confidence = np.max(prediction) * 100
        self.result_label.setText(f"Prediction: {result} ({confidence:.1f}%)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Build and run the Qt application.
    qt_app = QApplication(sys.argv)
    qt_app.setStyle("Breeze")  # Use system theme so it looks nice on linux
    main_window = MainWindow()
    main_window.show()
    sys.exit(qt_app.exec())
|
||||||
29
convert_from_big_dataset.py
Normal file
29
convert_from_big_dataset.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
"""Resize the raw letter data set to 32x32 grayscale and save it as .npy.

Walks ``<directory>/BigDataSet/<letter>/*``, converts every image to 32x32
grayscale, saves the scaled copies under ``<directory>/Scaled/<letter>/`` and
dumps the images/labels as ``images_big.npy`` / ``labels_big.npy``.
"""
import os
from PIL import Image
import numpy as np

directory = "/home/mia/Schule/KISY/schrifterkennung/out"

# One sub-folder per letter; the folder name is the label.
files = os.listdir(f"{directory}/BigDataSet")
print(files)
labels = []
images = []
for letter in files:
    # Renamed from `dir`, which shadowed the builtin of the same name.
    letter_dir = f"{directory}/BigDataSet/{letter}"
    os.makedirs(f"{directory}/Scaled/{letter}/", exist_ok=True)
    # enumerate replaces the previous hand-rolled `i` counter (and the
    # no-op `[:]` slice on the listing).
    for i, image in enumerate(os.listdir(letter_dir)):
        print(image)
        img = Image.open(os.path.join(letter_dir, image))
        # 32x32 single-channel ("L") images — the format make_model.py expects.
        res = img.resize((32, 32)).convert("L")
        images.append(res)
        labels.append(letter)  # label is the folder name
        res.save(f"{directory}/Scaled/{letter}/miakieler_{i}.png")

X = np.array(images)
Y = np.array(labels)

np.save('images_big.npy', X)
np.save('labels_big.npy', Y)
|
||||||
72
make_model.py
Normal file
72
make_model.py
Normal file
|
|
@ -0,0 +1,72 @@
|
||||||
|
"""Train a CNN letter classifier on the data set produced by
convert_from_big_dataset.py and save it as model.keras."""
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load the prepared data set.
X = np.load('images_big.npy')   # input images
Y = np.load('labels_big.npy')   # labels (letters)

# Map the string labels onto consecutive integer class ids.
unique_labels, Y_numeric = np.unique(Y, return_inverse=True)

# Preprocessing: 4-D image tensor (N, rows, cols, channels) and
# one-hot encoded labels.
X = X.reshape(-1, 32, 32, 1)
Y_categorical = to_categorical(Y_numeric)

# Hold out 10% of the data for validation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y_categorical, test_size=0.1, random_state=42
)

# CNN: two conv/batch-norm/pool stages followed by a dense head with one
# softmax unit per letter class.
model = keras.Sequential([
    keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(32, 32, 1)),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Conv2D(128, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(len(unique_labels), activation='softmax'),
])

optimizer = Adam(learning_rate=0.001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Compile and inspect the model.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train with early stopping on the validation loss.
model.fit(X_train, Y_train, epochs=20, batch_size=128, validation_data=(X_test, Y_test), callbacks=[early_stopping])

# Evaluate on the held-out split and persist the model.
test_loss, test_accuracy = model.evaluate(X_test, Y_test)
print(f'Testgenauigkeit: {test_accuracy:.4f}')
model.save("model.keras")
|
||||||
|
|
||||||
BIN
model.keras
Normal file
BIN
model.keras
Normal file
Binary file not shown.
18
shell.nix
Normal file
18
shell.nix
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
# Development shell for the handwriting-recognition scripts: a Python 3
# interpreter with the GUI, ML and image-processing packages preinstalled.
let
  pkgs = import <nixpkgs> {};
in pkgs.mkShell {
  packages = [
    (pkgs.python3.withPackages (python-pkgs: [
      python-pkgs.pygame
      python-pkgs.matplotlib
      python-pkgs.sklearn-compat
      python-pkgs.pandas
      python-pkgs.opencv-python
      python-pkgs.pytesseract
      python-pkgs.tensorflow
      python-pkgs.keras
      python-pkgs.pyqt6
    ]))
  ];
}
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue