diff --git a/capzlog-ExtractDataFromPDF/Program.cs b/capzlog-ExtractDataFromPDF/Program.cs
index 52ce5ed..67b9041 100644
--- a/capzlog-ExtractDataFromPDF/Program.cs
+++ b/capzlog-ExtractDataFromPDF/Program.cs
@@ -8,84 +8,31 @@ namespace capzlog_ExtractDataFromPDF
 {
     public class ParseCzech
     {
-        public static readonly String DEST = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\text.txt";
         public static readonly String SRC = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\Task 1 - Extract Data from a PDF File - Sample File.pdf";
         public static void Main(String[] args)
         {
             using (var pdfDoc = PdfDocument.Open(SRC))
             {
-                //Prova prova = new Prova();
-                //prova.ProvaRead(SRC);
-
-
-                // var testoEstratto = EstraiTestoDaPosizione(SRC, pagina: 85, xMin: 400, yMin: 400, xMax: 800, yMax: 800);
-
-                //Console.WriteLine("Testo Estratto:");
-                // Console.WriteLine(testoEstratto);
-
-                //EstraiCrewConRegex(SRC, 85);
                 SinglePageReader reader = new SinglePageReader();
-                reader.GetCrewAndFlightAssignment(pdfDoc, 89);
+                string content = reader.GetCrewAndFlightAssignment(pdfDoc, 89);
+                BriefingExtractor briefingExtractor = new BriefingExtractor(content);
+                Console.WriteLine(""+briefingExtractor.ExtractCrew()[0].Function);
+                Console.WriteLine(briefingExtractor.ExtractPassengers().Business);
+                Console.WriteLine(briefingExtractor.ExtractFlightAssignment().DOI);
+
+                string content2 = reader.GetCrewAndFlightAssignment(pdfDoc, 12);
+
+                FlightPlanExtractor flightPlanExtractor = new FlightPlanExtractor(content2);
+
+                Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().FuelData.Limc);
+                Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().MassLoad.ZeroFuelMass);
+                Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().Schedule.ScheduledArrivalTime);
+                Console.WriteLine("GainLoss: "+flightPlanExtractor.ExtractFlightPlan().Correction.GainOrLoss);
+
+
             }
-            static string EstraiTestoDaPosizione(string filePath, int pagina, double xMin, double yMin, double xMax, double yMax)
-            {
-                using (PdfDocument pdf = PdfDocument.Open(filePath))
-                {
-                    var paginaPdf = pdf.GetPage(pagina);
-                    List testoNellaZona = new List();
-
-                    foreach (var parola in paginaPdf.GetWords())
-                    {
-                        var posizione = parola.BoundingBox;
-
-                        if (posizione.Left >= xMin && posizione.Right <= xMax &&
-                            posizione.Bottom >= yMin && posizione.Top <= yMax)
-                        {
-                            testoNellaZona.Add(parola.Text);
-                        }
-                    }
-
-                    return string.Join(" ", testoNellaZona);
-                }
-            }
-
-            static void EstraiCrewConRegex(string filePath, int pagina)
-            {
-                List crew = new List();
-                using (PdfDocument pdf = PdfDocument.Open(filePath))
-                {
-                    var paginaPdf = pdf.GetPage(pagina);
-                    string testo = paginaPdf.Text;
-
-                    // Regex per catturare ruolo, codice, nome e posizione in maniera compatta
-                    var regex = new Regex(@"\b(CMD|COP|CAB|SEN)\s(\w+)\s([A-Za-z]+\s[A-Za-z]+)\s(.+?)(?=\bCMD|\bCOP|\bCAB|\bSEN|Observer|Contacts|Crew|\Z)");
-
-                    foreach (Match match in regex.Matches(testo))
-                    {
-                        string ruolo = match.Groups[1].Value;
-                        string codice = match.Groups[2].Value;
-                        string nome = match.Groups[3].Value;
-                        string posizione = match.Groups[4].Value;
-
-                        crew.Add($"{ruolo} {codice} {nome} - {posizione}");
-                    }
-                }
-            }
-
-
-
-
-            //
-
-            // reader.ReadPage(SRC, 85); // Read page 2 for example
-            //
-            //FileInfo file = new FileInfo(DEST);
-            //file.Directory.Create();
-
-            //new ParseCzech().ManipulatePdf(DEST);
         }
diff --git a/capzlog-ExtractDataFromPDF/SinglePageReader.cs b/capzlog-ExtractDataFromPDF/SinglePageReader.cs
index 904e1b9..d8ad912 100644
--- a/capzlog-ExtractDataFromPDF/SinglePageReader.cs
+++ b/capzlog-ExtractDataFromPDF/SinglePageReader.cs
@@ -8,79 +8,17 @@ namespace capzlog_ExtractDataFromPDF;
 //GET flightAssigment and flight crew
 public class SinglePageReader
 {
-    // Method to read a specific page
-    // public void ReadPage(string pdfPath, int pageNumber)
-    // {
-    //     using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(pdfPath)))
-    //     {
-    //         if (pageNumber < 1 || pageNumber > pdfDoc.GetNumberOfPages())
-    //         {
-    //             Console.WriteLine($"Page {pageNumber} does not exist in the document.");
-    //             return;
-    //         }
-    //
-    //         // Create a text extraction renderer
-    //         LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
-    //
-    //         // Process the specified page content
-    //         PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
-    //         parser.ProcessPageContent(pdfDoc.GetPage(pageNumber));
-    //
-    //         // Extracted text from the specified page
-    //         string pageText = strategy.GetResultantText();
-    //
-    //         // Log the extracted text to the console (for now, we don't return anything)
-    //         Console.WriteLine($"Text from Page {pageNumber}:");
-    //         Console.WriteLine(pageText);
-    //     }
-    // }
+
-    public void GetCrewAndFlightAssignment(PdfDocument pdfDocument, int pageNumber)
+    public string GetCrewAndFlightAssignment(PdfDocument pdfDocument, int pageNumber)
     {
         ExtractText extractText = new ExtractText();
         var textBlocks = extractText.ExtractTextBlocks(pdfDocument, pageNumber);
-
+        return textBlocks;
-
-
-        Console.WriteLine(textBlocks);
-
-
-
-
     }
-    private List GetCrewData(IEnumerable textBlocks)
-    {
-        List crewList = new List();
-        var data = textBlocks.ElementAt(0).Text.Split('\n').ToList();
-
-        var nameraw = textBlocks.ElementAt(3).Text.Split('\n').ToList();
-
-        if (data.Count>0)
-        {
-            for (int i = 2; i < data.Count; i++)
-            {
-                var crew = new Crew();
-                crew.Function = data[i];
-
-                string[] parts = nameraw[i-1].Split(new[] { ' ' }, 2); // Divide al primo spazio
-
-                string part1 = parts[0]; // "VEN"
-                string part2 = parts.Length > 1 ? parts[1] : ""; // "Nico Verhelst"
-
-                crew.Name = part2;
-                crew.Lc = part1;
-
-                crewList.Add(crew);
-
-            }
-
-        }
-
-        return crewList;
-    }
 }
\ No newline at end of file
diff --git a/capzlog-ExtractDataFromPDF/models/Crew.cs b/capzlog-ExtractDataFromPDF/models/Crew.cs
new file mode 100644
index 0000000..6f99d7b
--- /dev/null
+++ b/capzlog-ExtractDataFromPDF/models/Crew.cs
@@ -0,0 +1,22 @@
+namespace capzlog_ExtractDataFromPDF.models
+{
+    public class Crew
+    {
+        public string Function { get; set; } = "";
+        public string Lc { get; set; } = "";
+        public string Name { get; set; } = "";
+
+    }
+
+    public class FlightAssigment
+    {
+        public double DOW { get; set; } = 0;
+        public double DOI { get; set; } = 0;
+    }
+    public class Passegers
+    {
+        public int Business { get; set; } = 0;
+        public int Economy { get; set; } = 0;
+    }
+}
+
diff --git a/capzlog-ExtractDataFromPDF/models/crew.cs b/capzlog-ExtractDataFromPDF/models/crew.cs
deleted file mode 100644
index 8ef94ea..0000000
--- a/capzlog-ExtractDataFromPDF/models/crew.cs
+++ /dev/null
@@ -1,10 +0,0 @@
-namespace capzlog_ExtractDataFromPDF.models;
-
-public class Crew
-{
-    public string Function { get; set; } = "";
-    public string Lc { get; set; } = "";
-    public string Name { get; set; } = "";
-    public string FunctionExtended { get; set; } = "";
-
-}
\ No newline at end of file
diff --git a/capzlog-ExtractDataFromPDF/util/BriefingExtractor.cs b/capzlog-ExtractDataFromPDF/util/BriefingExtractor.cs
new file mode 100644
index 0000000..e485f58
--- /dev/null
+++ b/capzlog-ExtractDataFromPDF/util/BriefingExtractor.cs
@@ -0,0 +1,133 @@
+using System.Globalization;
+using System.Text.RegularExpressions;
+using capzlog_ExtractDataFromPDF.models;
+
+namespace capzlog_ExtractDataFromPDF;
+
+/// <summary>
+/// Extracts crew, passenger and flight-assignment data from the text of a briefing page.
+/// </summary>
+public class BriefingExtractor
+{
+    // Start of a crew row: function code, one whitespace character, three capital letters.
+    private static readonly Regex CrewLineStart = new Regex(@"^(CMD|COP|CAB|SEN)\s[A-Z]{3}");
+
+    // One crew row: function code, code word and name.
+    private static readonly Regex CrewEntry = new Regex(@"(CMD|COP|CAB|SEN)\s+(\w+)\s+([A-Za-zÀ-ÿ\s\-]+)");
+
+    private readonly string briefingText;
+
+    /// <summary>Creates an extractor over the given briefing text.</summary>
+    /// <param name="briefingText">Text of the briefing page; null is treated as empty text.</param>
+    public BriefingExtractor(string briefingText)
+    {
+        this.briefingText = briefingText ?? "";
+    }
+
+    /// <summary>Reads the values that follow the "DOW:" and "DOI:" labels.</summary>
+    /// <returns>The parsed values; a value that is missing or unparsable stays 0.</returns>
+    public FlightAssigment ExtractFlightAssignment()
+    {
+        FlightAssigment flightAssignment = new FlightAssigment();
+
+        var dowMatch = Regex.Match(briefingText, @"DOW:\s*(\d+(?:\.\d+)?)kg");
+        if (dowMatch.Success && TryParseNumber(dowMatch.Groups[1].Value, out double dow))
+        {
+            flightAssignment.DOW = dow;
+        }
+
+        var doiMatch = Regex.Match(briefingText, @"DOI:\s*(\d+(?:\.\d+)?)");
+        if (doiMatch.Success && TryParseNumber(doiMatch.Groups[1].Value, out double doi))
+        {
+            flightAssignment.DOI = doi;
+        }
+
+        return flightAssignment;
+    }
+
+    /// <summary>Reads the first "business/economy" pair of integers found in the text.</summary>
+    /// <returns>The parsed counts; both stay 0 when no pair is found or a number does not fit an int.</returns>
+    public Passegers ExtractPassengers()
+    {
+        Passegers passengers = new Passegers();
+
+        // NOTE(review): the pattern is not tied to a label, so the first "n/m" pair anywhere in the
+        // text wins - confirm this cannot hit something else (e.g. a date) on the real page.
+        var paxMatch = Regex.Match(briefingText, @"(\d+)\/(\d+)");
+        if (paxMatch.Success
+            && int.TryParse(paxMatch.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int business)
+            && int.TryParse(paxMatch.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int economy))
+        {
+            passengers.Business = business;
+            passengers.Economy = economy;
+        }
+
+        return passengers;
+    }
+
+    //TODO: check for multiple crew members in the same function
+    /// <summary>Reads the crew rows (function, code and name) from the text.</summary>
+    /// <returns>One entry per matching crew row; empty when the text has no crew row.</returns>
+    public List<Crew> ExtractCrew()
+    {
+        List<Crew> crewList = new List<Crew>();
+        List<string> combinedLines = new List<string>();
+
+        // Re-join wrapped rows: a row starts at a crew line and runs until the next crew line.
+        // Lines before the first crew line are not part of any row and are skipped.
+        bool rowStarted = false;
+        string currentLine = "";
+        foreach (var line in briefingText.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries))
+        {
+            var trimmedLine = line.Trim();
+
+            if (CrewLineStart.IsMatch(trimmedLine))
+            {
+                if (rowStarted)
+                {
+                    combinedLines.Add(currentLine.Trim());
+                }
+
+                rowStarted = true;
+                currentLine = trimmedLine;
+            }
+            else if (rowStarted)
+            {
+                currentLine += " " + trimmedLine;
+            }
+        }
+
+        if (!rowStarted)
+        {
+            return crewList;
+        }
+
+        // Cut the last row at the first "X:" (presumably text that follows the crew table - confirm).
+        string lastRow = currentLine.Trim();
+        int index = lastRow.IndexOf("X:", StringComparison.Ordinal);
+        combinedLines.Add(index >= 0 ? lastRow.Substring(0, index) : lastRow);
+
+        foreach (var line in combinedLines)
+        {
+            var match = CrewEntry.Match(line);
+            if (match.Success)
+            {
+                crewList.Add(new Crew
+                {
+                    Function = match.Groups[1].Value,
+                    Lc = match.Groups[2].Value,
+                    // The name class also matches whitespace, so trim what it picked up at the end.
+                    Name = match.Groups[3].Value.Trim(),
+                });
+            }
+        }
+
+        return crewList;
+    }
+
+    // Parses a plain decimal number with '.' as the separator, independent of the current culture.
+    private static bool TryParseNumber(string text, out double value)
+    {
+        return double.TryParse(text, NumberStyles.Float, CultureInfo.InvariantCulture, out value);
+    }
+}
\ No newline at end of file
diff --git a/capzlog-ExtractDataFromPDF/util/FlightPlanExtractor.cs b/capzlog-ExtractDataFromPDF/util/FlightPlanExtractor.cs
new file mode 100644
index 0000000..8e50410
--- /dev/null
+++ b/capzlog-ExtractDataFromPDF/util/FlightPlanExtractor.cs
@@ -0,0 +1,158 @@
+using System.Globalization;
+using System.Text.RegularExpressions;
+using capzlog_ExtractDataFromPDF.models;
+
+namespace capzlog_ExtractDataFromPDF;
+
+/// <summary>
+/// Extracts flight info, schedule, zero-fuel mass, fuel figures and the gain/loss correction
+/// from the text of a flight-plan page.
+/// </summary>
+public class FlightPlanExtractor
+{
+    private readonly string OperationText;
+
+    /// <summary>Creates an extractor over the given flight-plan text.</summary>
+    /// <param name="operationText">Text of the flight-plan page; null is treated as empty text.</param>
+    public FlightPlanExtractor(string operationText)
+    {
+        OperationText = operationText ?? "";
+    }
+
+    /// <summary>Runs every section extractor and returns the combined result.</summary>
+    /// <returns>A new <see cref="Flight"/>; sections whose labels are not found keep their defaults.</returns>
+    public Flight ExtractFlightPlan()
+    {
+        Flight flight = new Flight();
+        flight.Info = ExtractFlightInfo();
+        flight.Schedule = ExtractTimes();
+        flight.MassLoad = ExtractLoadMass();
+        flight.FuelData = ExtractFuel();
+        flight.Correction = ExtractCorrections();
+        return flight;
+    }
+
+    // Reads the labelled header fields; a field whose label is not found becomes "".
+    private FlightInfo ExtractFlightInfo()
+    {
+        var datePattern = @"Date:\s(\d{2}[A-Z]{3}\d{2})";
+        var registrationPattern = @"Reg\.\s*:\s*([A-Z0-9]+)";
+        var aircraftTypePattern = @"Type:\s([A-Z0-9]+)";
+        var departurePattern = @"From:\s([A-Z]{4})";
+        var destinationPattern = @"To:\s([A-Z]{4})";
+        var alternate1Pattern = @"ALTN1:\s([A-Z]{4})";
+        var flightNumberPattern = @"FltNr:\s([A-Z0-9]+)";
+        var atcCodePattern = @"ATC:\s([A-Z0-9]+)";
+
+        FlightInfo flightInfo = new FlightInfo();
+
+        // The date pattern also accepts impossible dates (e.g. "99XYZ24"), so parse defensively
+        // and leave Date untouched when the text is not a real date.
+        var dateMatch = Regex.Match(OperationText, datePattern);
+        if (dateMatch.Success
+            && DateTime.TryParseExact(dateMatch.Groups[1].Value, "ddMMMyy", CultureInfo.InvariantCulture,
+                DateTimeStyles.None, out DateTime parsedDate))
+        {
+            // e.g. "19MAR24" becomes "19.03.2024"; formatting is culture-independent as well.
+            flightInfo.Date = parsedDate.ToString("dd.MM.yyyy", CultureInfo.InvariantCulture);
+        }
+
+        flightInfo.Registration = FirstGroup(registrationPattern);
+        flightInfo.AircraftType = FirstGroup(aircraftTypePattern);
+        flightInfo.Departure = FirstGroup(departurePattern);
+        flightInfo.Destination = FirstGroup(destinationPattern);
+        flightInfo.Alternate1 = FirstGroup(alternate1Pattern);
+        flightInfo.FlightNumber = FirstGroup(flightNumberPattern);
+        flightInfo.ATCCode = FirstGroup(atcCodePattern);
+        return flightInfo;
+    }
+
+    // Reads the scheduled departure (STD) and arrival (STA) times.
+    private Times ExtractTimes()
+    {
+        var timesPattern = @"STD:\s(\d{2}:\d{2})\sSTA:\s(\d{2}:\d{2})";
+
+        var times = new Times();
+
+        var match = Regex.Match(OperationText, timesPattern);
+        if (match.Success)
+        {
+            times.ScheduledDepartureTime = match.Groups[1].Value;
+            times.ScheduledArrivalTime = match.Groups[2].Value;
+        }
+
+        return times;
+    }
+
+    // Reads the digits that follow the "ZFM:" label.
+    private LoadMass ExtractLoadMass()
+    {
+        var zeroFuelMassPattern = @"ZFM:\s(\d+)";
+
+        LoadMass loadMass = new LoadMass();
+
+        var zfmMatch = Regex.Match(OperationText, zeroFuelMassPattern);
+        if (zfmMatch.Success)
+        {
+            loadMass.ZeroFuelMass = zfmMatch.Groups[1].Value;
+        }
+
+        return loadMass;
+    }
+
+    // Reads the values that follow the "LIMC:", "LIML:" and "MIN:" labels.
+    private Fuel ExtractFuel()
+    {
+        // NOTE(review): each group captures one token, a space and only the FIRST character of the
+        // next token - confirm that is intended rather than the whole second token.
+        var limcPattern = @"LIMC:\s([^\s]+ [^\s])";
+        var limlPattern = @"LIML:\s([^\s]+ [^\s])";
+        var minPattern = @"MIN:\s([^\s]+ [^\s])";
+
+        var fuelData = new Fuel();
+
+        var limcMatch = Regex.Match(OperationText, limcPattern);
+        if (limcMatch.Success)
+        {
+            fuelData.Limc = limcMatch.Groups[1].Value;
+        }
+
+        var limlMatch = Regex.Match(OperationText, limlPattern);
+        if (limlMatch.Success)
+        {
+            fuelData.Liml = limlMatch.Groups[1].Value;
+        }
+
+        var minMatch = Regex.Match(OperationText, minPattern);
+        if (minMatch.Success)
+        {
+            fuelData.MinimumRequired = minMatch.Groups[1].Value;
+        }
+
+        return fuelData;
+    }
+
+    // Reads "Gain / Loss: GAIN|LOSS <amount>$/TON" as a signed amount.
+    private Corrections ExtractCorrections()
+    {
+        var gainLossPattern = @"Gain\s*/\s*Loss:\s*(GAIN|LOSS)\s*(\d+)\$/TON";
+
+        var corrections = new Corrections();
+
+        var match = Regex.Match(OperationText, gainLossPattern);
+        if (match.Success
+            && double.TryParse(match.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out double amount))
+        {
+            // GAIN is stored as a positive amount, LOSS as a negative one.
+            corrections.GainOrLoss = match.Groups[1].Value == "GAIN" ? amount : -amount;
+        }
+
+        return corrections;
+    }
+
+    // Value of the first capture group of the first match, or "" when the pattern does not match.
+    private string FirstGroup(string pattern)
+    {
+        return Regex.Match(OperationText, pattern).Groups[1].Value;
+    }
+}
\ No newline at end of file