From eb6a40b82cc5ff7b9b8efac5520bfee608b9ab7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joe=20K=C3=BCng?= Date: Wed, 6 Nov 2024 10:30:03 +0100 Subject: [PATCH] work in progress program.cs --- capzlog-ExtractDataFromPDF/Program.cs | 100 +++++++++++++++++- .../capzlog-ExtractDataFromPDF.csproj | 7 ++ 2 files changed, 104 insertions(+), 3 deletions(-) diff --git a/capzlog-ExtractDataFromPDF/Program.cs b/capzlog-ExtractDataFromPDF/Program.cs index 8a84e20..ce41fd1 100644 --- a/capzlog-ExtractDataFromPDF/Program.cs +++ b/capzlog-ExtractDataFromPDF/Program.cs @@ -1,4 +1,98 @@ -// See https://aka.ms/new-console-template for more information -using System; +using System; +using System.IO; +using System.Text; +using System.Text.RegularExpressions; +using capzlog_ExtractDataFromPDF.pdfsharp; +using iText.Kernel.Pdf.Canvas.Parser; +using iText.Kernel.Pdf.Canvas.Parser.Listener; +using UglyToad.PdfPig; -Console.WriteLine("Hello, World!"); \ No newline at end of file + +namespace capzlog_ExtractDataFromPDF +{ + public class ParseCzech + { + public static readonly String DEST = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\text.txt"; + public static readonly String SRC = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\Task 1 - Extract Data from a PDF File - Sample File.pdf"; + public static void Main(String[] args) + { + using (var pdfDoc = PdfDocument.Open(SRC)) + { + //Prova prova = new Prova(); + //prova.ProvaRead(SRC); + + + + // var testoEstratto = EstraiTestoDaPosizione(SRC, pagina: 85, xMin: 400, yMin: 400, xMax: 800, yMax: 800); + + //Console.WriteLine("Testo Estratto:"); + // Console.WriteLine(testoEstratto); + + //EstraiCrewConRegex(SRC, 85); + + SinglePageReader reader = new SinglePageReader(); + reader.GetCrewAndFlightAssignment(pdfDoc, 89); + } + + static string EstraiTestoDaPosizione(string filePath, int pagina, double xMin, double yMin, double xMax, double yMax) + { + using (PdfDocument pdf = PdfDocument.Open(filePath)) + { + var paginaPdf = pdf.GetPage(pagina); + List testoNellaZona = new List(); + + foreach (var parola in paginaPdf.GetWords()) + { + var posizione = parola.BoundingBox; + + if (posizione.Left >= xMin && posizione.Right <= xMax && + posizione.Bottom >= yMin && posizione.Top <= yMax) + { + testoNellaZona.Add(parola.Text); + } + } + + return string.Join(" ", testoNellaZona); + } + } + + static void EstraiCrewConRegex(string filePath, int pagina) + { + List crew = new List(); + using (PdfDocument pdf = PdfDocument.Open(filePath)) + { + var paginaPdf = pdf.GetPage(pagina); + string testo = paginaPdf.Text; + + // Regex per catturare ruolo, codice, nome e posizione in maniera compatta + var regex = new Regex(@"\b(CMD|COP|CAB|SEN)\s(\w+)\s([A-Za-z]+\s[A-Za-z]+)\s(.+?)(?=\bCMD|\bCOP|\bCAB|\bSEN|Observer|Contacts|Crew|\Z)"); + + foreach (Match match in regex.Matches(testo)) + { + string ruolo = match.Groups[1].Value; + string codice = match.Groups[2].Value; + string nome = match.Groups[3].Value; + string posizione = match.Groups[4].Value; + + crew.Add($"{ruolo} {codice} {nome} - {posizione}"); + } + } + } + + + + + // + + // reader.ReadPage(SRC, 85); // Read page 2 for example + // + //FileInfo file = new FileInfo(DEST); + //file.Directory.Create(); + + //new ParseCzech().ManipulatePdf(DEST); + } + + + + } +} \ No newline at end of file diff --git a/capzlog-ExtractDataFromPDF/capzlog-ExtractDataFromPDF.csproj b/capzlog-ExtractDataFromPDF/capzlog-ExtractDataFromPDF.csproj index c19000d..b97475e 100644 --- a/capzlog-ExtractDataFromPDF/capzlog-ExtractDataFromPDF.csproj +++ b/capzlog-ExtractDataFromPDF/capzlog-ExtractDataFromPDF.csproj @@ -8,4 +8,11 @@ enable + + + + + + +