added extractor

This commit is contained in:
Joe Küng
2024-11-06 16:26:32 +01:00
parent 7295ce30af
commit b77fae011c
6 changed files with 293 additions and 143 deletions

View File

@@ -8,84 +8,31 @@ namespace capzlog_ExtractDataFromPDF
{ {
public class ParseCzech public class ParseCzech
{ {
public static readonly String DEST = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\text.txt";
public static readonly String SRC = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\Task 1 - Extract Data from a PDF File - Sample File.pdf"; public static readonly String SRC = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\Task 1 - Extract Data from a PDF File - Sample File.pdf";
public static void Main(String[] args) public static void Main(String[] args)
{ {
using (var pdfDoc = PdfDocument.Open(SRC)) using (var pdfDoc = PdfDocument.Open(SRC))
{ {
//Prova prova = new Prova();
//prova.ProvaRead(SRC);
// var testoEstratto = EstraiTestoDaPosizione(SRC, pagina: 85, xMin: 400, yMin: 400, xMax: 800, yMax: 800);
//Console.WriteLine("Testo Estratto:");
// Console.WriteLine(testoEstratto);
//EstraiCrewConRegex(SRC, 85);
SinglePageReader reader = new SinglePageReader(); SinglePageReader reader = new SinglePageReader();
reader.GetCrewAndFlightAssignment(pdfDoc, 89); string content = reader.GetCrewAndFlightAssignment(pdfDoc, 89);
BriefingExtractor briefingExtractor = new BriefingExtractor(content);
Console.WriteLine(""+briefingExtractor.ExtractCrew()[0].Function);
Console.WriteLine(briefingExtractor.ExtractPassengers().Business);
Console.WriteLine(briefingExtractor.ExtractFlightAssignment().DOI);
string content2 = reader.GetCrewAndFlightAssignment(pdfDoc, 12);
FlightPlanExtractor flightPlanExtractor = new FlightPlanExtractor(content2);
Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().FuelData.Limc);
Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().MassLoad.ZeroFuelMass);
Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().Schedule.ScheduledArrivalTime);
Console.WriteLine("GainLoss: "+flightPlanExtractor.ExtractFlightPlan().Correction.GainOrLoss);
} }
/// <summary>
/// Returns the text of every word on a page whose bounding box lies entirely
/// inside the given rectangle, joined with single spaces.
/// </summary>
/// <param name="filePath">Path of the PDF file to open.</param>
/// <param name="pagina">Page number passed to <c>GetPage</c>.</param>
/// <param name="xMin">Smallest accepted left edge.</param>
/// <param name="yMin">Smallest accepted bottom edge.</param>
/// <param name="xMax">Largest accepted right edge.</param>
/// <param name="yMax">Largest accepted top edge.</param>
/// <returns>The matching words in the order <c>GetWords</c> yields them; empty when none match.</returns>
static string EstraiTestoDaPosizione(string filePath, int pagina, double xMin, double yMin, double xMax, double yMax)
{
    using (PdfDocument document = PdfDocument.Open(filePath))
    {
        var wordsInside = new List<string>();

        foreach (var word in document.GetPage(pagina).GetWords())
        {
            var box = word.BoundingBox;

            bool fitsHorizontally = box.Left >= xMin && box.Right <= xMax;
            bool fitsVertically = box.Bottom >= yMin && box.Top <= yMax;
            if (!(fitsHorizontally && fitsVertically))
            {
                // Words that only partially overlap the rectangle are skipped.
                continue;
            }

            wordsInside.Add(word.Text);
        }

        return string.Join(" ", wordsInside);
    }
}
/// <summary>
/// Opens the PDF, reads the text of one page and formats every crew entry the
/// pattern finds as "role code name - position".
/// </summary>
/// <param name="filePath">Path of the PDF file to open.</param>
/// <param name="pagina">Page number passed to <c>GetPage</c>.</param>
/// <remarks>
/// NOTE(review): the formatted entries are collected in a local list that is
/// neither returned nor printed, so callers observe no result.
/// </remarks>
static void EstraiCrewConRegex(string filePath, int pagina)
{
    var entries = new List<string>();

    using (PdfDocument document = PdfDocument.Open(filePath))
    {
        string pageText = document.GetPage(pagina).Text;

        // Captures role, code, name and position in one compact expression;
        // the lookahead stops the position at the next role keyword or section title.
        var crewPattern = new Regex(@"\b(CMD|COP|CAB|SEN)\s(\w+)\s([A-Za-z]+\s[A-Za-z]+)\s(.+?)(?=\bCMD|\bCOP|\bCAB|\bSEN|Observer|Contacts|Crew|\Z)");
        MatchCollection hits = crewPattern.Matches(pageText);

        for (int i = 0; i < hits.Count; i++)
        {
            GroupCollection g = hits[i].Groups;
            entries.Add($"{g[1].Value} {g[2].Value} {g[3].Value} - {g[4].Value}");
        }
    }
}
//
// reader.ReadPage(SRC, 85); // Read page 2 for example
//
//FileInfo file = new FileInfo(DEST);
//file.Directory.Create();
//new ParseCzech().ManipulatePdf(DEST);
} }

View File

@@ -8,79 +8,17 @@ namespace capzlog_ExtractDataFromPDF;
//GET flightAssigment and flight crew //GET flightAssigment and flight crew
public class SinglePageReader public class SinglePageReader
{ {
// Method to read a specific page
// public void ReadPage(string pdfPath, int pageNumber)
// {
// using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(pdfPath)))
// {
// if (pageNumber < 1 || pageNumber > pdfDoc.GetNumberOfPages())
// {
// Console.WriteLine($"Page {pageNumber} does not exist in the document.");
// return;
// }
//
// // Create a text extraction renderer
// LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
//
// // Process the specified page content
// PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
// parser.ProcessPageContent(pdfDoc.GetPage(pageNumber));
//
// // Extracted text from the specified page
// string pageText = strategy.GetResultantText();
//
// // Log the extracted text to the console (for now, we don't return anything)
// Console.WriteLine($"Text from Page {pageNumber}:");
// Console.WriteLine(pageText);
// }
// }
public void GetCrewAndFlightAssignment(PdfDocument pdfDocument, int pageNumber)
public string GetCrewAndFlightAssignment(PdfDocument pdfDocument, int pageNumber)
{ {
ExtractText extractText = new ExtractText(); ExtractText extractText = new ExtractText();
var textBlocks = extractText.ExtractTextBlocks(pdfDocument, pageNumber); var textBlocks = extractText.ExtractTextBlocks(pdfDocument, pageNumber);
return textBlocks;
Console.WriteLine(textBlocks);
} }
/// <summary>
/// Builds crew entries from two text blocks: block 0 supplies the function of
/// each crew member, block 3 supplies a line of the form "code name".
/// </summary>
/// <param name="textBlocks">Text blocks of the page; must contain at least four.</param>
/// <returns>One <see cref="Crew"/> per function line, starting at line index 2.</returns>
private List<Crew> GetCrewData(IEnumerable<TextBlock> textBlocks)
{
    List<string> functionLines = textBlocks.ElementAt(0).Text.Split('\n').ToList();
    List<string> nameLines = textBlocks.ElementAt(3).Text.Split('\n').ToList();

    var result = new List<Crew>();

    // Function lines are read from index 2; the matching name line sits one index lower.
    int row = 2;
    while (row < functionLines.Count)
    {
        // Split at the first space only: the code comes first (e.g. "VEN"), the rest is the name.
        string[] codeAndName = nameLines[row - 1].Split(new[] { ' ' }, 2);

        result.Add(new Crew
        {
            Function = functionLines[row],
            Lc = codeAndName[0],
            Name = codeAndName.Length > 1 ? codeAndName[1] : "",
        });

        row++;
    }

    return result;
}
} }

View File

@@ -0,0 +1,22 @@
namespace capzlog_ExtractDataFromPDF.models
{
/// <summary>
/// One crew member as read from a briefing page.
/// </summary>
public class Crew
{
    /// <summary>Function keyword of the crew member (the extractor recognises CMD, COP, CAB and SEN). Empty until set.</summary>
    public string Function { get; set; } = string.Empty;

    /// <summary>Short code that follows the function on the crew line. Empty until set.</summary>
    public string Lc { get; set; } = string.Empty;

    /// <summary>Name of the crew member. Empty until set.</summary>
    public string Name { get; set; } = string.Empty;
}
/// <summary>
/// Numeric values taken from the "DOW:" and "DOI:" fields of a briefing.
/// </summary>
public class FlightAssigment
{
    /// <summary>Value of the "DOW:" field (the extractor expects it to be followed by "kg"). 0 until set.</summary>
    public double DOW { get; set; }

    /// <summary>Value of the "DOI:" field. 0 until set.</summary>
    public double DOI { get; set; }
}
/// <summary>
/// Passenger counts per cabin class.
/// </summary>
public class Passegers
{
    /// <summary>Number of business passengers. 0 until set.</summary>
    public int Business { get; set; }

    /// <summary>Number of economy passengers. 0 until set.</summary>
    public int Economy { get; set; }
}
}

View File

@@ -1,10 +0,0 @@
namespace capzlog_ExtractDataFromPDF.models;
/// <summary>
/// One crew member as read from a briefing page.
/// </summary>
public class Crew
{
    /// <summary>Function of the crew member. Empty until set.</summary>
    public string Function { get; set; } = string.Empty;

    /// <summary>Short code associated with the crew member. Empty until set.</summary>
    public string Lc { get; set; } = string.Empty;

    /// <summary>Name of the crew member. Empty until set.</summary>
    public string Name { get; set; } = string.Empty;

    /// <summary>
    /// Presumably a longer description of <see cref="Function"/> — confirm against the caller.
    /// Empty until set.
    /// </summary>
    public string FunctionExtended { get; set; } = string.Empty;
}

View File

@@ -0,0 +1,109 @@
using System.Globalization;
using System.Text.RegularExpressions;
using capzlog_ExtractDataFromPDF.models;
namespace capzlog_ExtractDataFromPDF;
/// <summary>
/// Extracts flight-assignment figures, passenger counts and crew members from
/// the plain text of a briefing page.
/// </summary>
public class BriefingExtractor
{
    // Patterns are constructed once and shared by every instance.
    private static readonly Regex DowPattern = new Regex(@"DOW:\s*(\d+(?:\.\d+)?)kg");
    private static readonly Regex DoiPattern = new Regex(@"DOI:\s*(\d+(?:\.\d+)?)");
    private static readonly Regex PaxPattern = new Regex(@"(\d+)\/(\d+)");
    private static readonly Regex CrewLineStart = new Regex(@"^(CMD|COP|CAB|SEN)\s[A-Z]{3}");
    private static readonly Regex CrewEntry = new Regex(@"(CMD|COP|CAB|SEN)\s+(\w+)\s+([A-Za-zÀ-ÿ\s\-]+)");

    private readonly string _briefingText;

    /// <summary>Creates an extractor over the given briefing text.</summary>
    /// <param name="briefingText">Text of the briefing page.</param>
    /// <exception cref="ArgumentNullException"><paramref name="briefingText"/> is null.</exception>
    public BriefingExtractor(string briefingText)
    {
        _briefingText = briefingText ?? throw new ArgumentNullException(nameof(briefingText));
    }

    /// <summary>
    /// Reads the number after "DOW:" (which must be followed by "kg") and the
    /// number after "DOI:".
    /// </summary>
    /// <returns>
    /// A <see cref="FlightAssigment"/>; a property keeps its default when its
    /// field is missing or not a valid number.
    /// </returns>
    public FlightAssigment ExtractFlightAssignment()
    {
        var flightAssignment = new FlightAssigment();

        if (TryReadNumber(DowPattern, out double dow))
        {
            flightAssignment.DOW = dow;
        }

        if (TryReadNumber(DoiPattern, out double doi))
        {
            flightAssignment.DOI = doi;
        }

        return flightAssignment;
    }

    /// <summary>
    /// Reads the first "number/number" pair in the text as business/economy
    /// passenger counts.
    /// </summary>
    /// <returns>
    /// A <see cref="Passegers"/>; both counts keep their defaults when no pair
    /// is found or a number does not fit in an <see cref="int"/>.
    /// </returns>
    public Passegers ExtractPassengers()
    {
        var passengers = new Passegers();

        // NOTE(review): the pattern is not anchored to a label, so any earlier
        // "digits/digits" sequence in the text would be picked up instead.
        Match paxMatch = PaxPattern.Match(_briefingText);
        if (paxMatch.Success
            && int.TryParse(paxMatch.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int business)
            && int.TryParse(paxMatch.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int economy))
        {
            passengers.Business = business;
            passengers.Economy = economy;
        }

        return passengers;
    }

    //TODO: check for multiple crew members in the same function
    /// <summary>
    /// Extracts the crew members listed in the briefing. A crew entry starts on
    /// a line beginning with CMD, COP, CAB or SEN followed by a three-letter
    /// upper-case code; following lines are appended to that entry.
    /// </summary>
    /// <returns>The crew members in document order; empty when none are found.</returns>
    public List<Crew> ExtractCrew()
    {
        List<string> entries = CollectCrewEntries();

        if (entries.Count > 0)
        {
            // The last entry also contains whatever follows the crew list;
            // cut it at the first "X:" marker.
            int last = entries.Count - 1;
            int cut = entries[last].IndexOf("X:", StringComparison.Ordinal);
            if (cut >= 0)
            {
                entries[last] = entries[last].Substring(0, cut);
            }
        }

        var crewList = new List<Crew>();
        foreach (string entry in entries)
        {
            Match match = CrewEntry.Match(entry);
            if (match.Success)
            {
                crewList.Add(new Crew
                {
                    Function = match.Groups[1].Value,
                    Lc = match.Groups[2].Value,
                    // The name group may end in whitespace matched by \s.
                    Name = match.Groups[3].Value.Trim(),
                });
            }
        }

        return crewList;
    }

    // Parses the first capture group of the pattern as an invariant-culture number.
    private bool TryReadNumber(Regex pattern, out double value)
    {
        value = 0;
        Match match = pattern.Match(_briefingText);
        return match.Success
            && double.TryParse(match.Groups[1].Value, NumberStyles.Float, CultureInfo.InvariantCulture, out value);
    }

    // Joins each crew line with its continuation lines. Text before the first
    // crew line is ignored, so a briefing without such text loses no entry and
    // a briefing without crew lines yields an empty list.
    private List<string> CollectCrewEntries()
    {
        var entries = new List<string>();
        string current = "";
        bool insideCrewList = false;

        foreach (string rawLine in _briefingText.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries))
        {
            string line = rawLine.Trim();

            if (CrewLineStart.IsMatch(line))
            {
                if (insideCrewList)
                {
                    entries.Add(current.Trim());
                }

                current = line;
                insideCrewList = true;
            }
            else if (insideCrewList)
            {
                current += " " + line;
            }
        }

        if (insideCrewList)
        {
            entries.Add(current.Trim());
        }

        return entries;
    }
}

View File

@@ -0,0 +1,144 @@
using System.Globalization;
using System.Text.RegularExpressions;
using capzlog_ExtractDataFromPDF.models;
namespace capzlog_ExtractDataFromPDF;
/// <summary>
/// Extracts flight information, scheduled times, zero-fuel mass, fuel figures
/// and the gain/loss correction from the text of a flight-plan page.
/// </summary>
public class FlightPlanExtractor
{
    private readonly string _operationText;

    /// <summary>Creates an extractor over the given flight-plan text.</summary>
    /// <param name="operationText">Text of the flight-plan page.</param>
    /// <exception cref="ArgumentNullException"><paramref name="operationText"/> is null.</exception>
    public FlightPlanExtractor(string operationText)
    {
        _operationText = operationText ?? throw new ArgumentNullException(nameof(operationText));
    }

    /// <summary>Runs every extraction step and assembles the results.</summary>
    /// <returns>A new <see cref="Flight"/> on every call; the text is re-parsed each time.</returns>
    public Flight ExtractFlightPlan()
    {
        Flight flight = new Flight();
        flight.Info = ExtractFlightInfo();
        flight.Schedule = ExtractTimes();
        flight.MassLoad = ExtractLoadMass();
        flight.FuelData = ExtractFuel();
        flight.Correction = ExtractCorrections();
        return flight;
    }

    // Reads the labelled header fields. Every field except Date is set to the
    // captured value, or to an empty string when its label is not found.
    private FlightInfo ExtractFlightInfo()
    {
        FlightInfo flightInfo = new FlightInfo();

        // The date appears as e.g. "19MAR24" and is stored as "19.03.2024".
        // The pattern accepts any three capital letters, so the month is
        // validated with TryParseExact; an invalid date leaves Date untouched
        // instead of throwing.
        if (TryCapture(@"Date:\s(\d{2}[A-Z]{3}\d{2})", out string rawDate)
            && DateTime.TryParseExact(rawDate, "ddMMMyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime parsedDate))
        {
            flightInfo.Date = parsedDate.ToString("dd.MM.yyyy", CultureInfo.InvariantCulture);
        }

        flightInfo.Registration = CaptureOrEmpty(@"Reg\.\s*:\s*([A-Z0-9]+)");
        flightInfo.AircraftType = CaptureOrEmpty(@"Type:\s([A-Z0-9]+)");
        flightInfo.Departure = CaptureOrEmpty(@"From:\s([A-Z]{4})");
        flightInfo.Destination = CaptureOrEmpty(@"To:\s([A-Z]{4})");
        flightInfo.Alternate1 = CaptureOrEmpty(@"ALTN1:\s([A-Z]{4})");
        flightInfo.FlightNumber = CaptureOrEmpty(@"FltNr:\s([A-Z0-9]+)");
        flightInfo.ATCCode = CaptureOrEmpty(@"ATC:\s([A-Z0-9]+)");
        return flightInfo;
    }

    // Reads the "STD: hh:mm STA: hh:mm" pair; both times stay at their
    // defaults when the pair is not found.
    private Times ExtractTimes()
    {
        var times = new Times();

        Match match = Regex.Match(_operationText, @"STD:\s(\d{2}:\d{2})\sSTA:\s(\d{2}:\d{2})");
        if (match.Success)
        {
            times.ScheduledDepartureTime = match.Groups[1].Value; // STD
            times.ScheduledArrivalTime = match.Groups[2].Value;   // STA
        }

        return times;
    }

    // Reads the digits after "ZFM:" as the zero-fuel mass.
    private LoadMass ExtractLoadMass()
    {
        LoadMass loadMass = new LoadMass();

        if (TryCapture(@"ZFM:\s(\d+)", out string zeroFuelMass))
        {
            loadMass.ZeroFuelMass = zeroFuelMass;
        }

        return loadMass;
    }

    // Reads the values labelled LIMC, LIML and MIN; a value stays at its
    // default when its label is not found.
    private Fuel ExtractFuel()
    {
        var fuelData = new Fuel();

        // NOTE(review): each pattern captures one token, a space and then a
        // SINGLE non-space character, so a second token longer than one
        // character is truncated. Confirm against the input format whether
        // the trailing [^\s] was meant to be [^\s]+.
        if (TryCapture(@"LIMC:\s([^\s]+ [^\s])", out string limc))
        {
            fuelData.Limc = limc;
        }

        if (TryCapture(@"LIML:\s([^\s]+ [^\s])", out string liml))
        {
            fuelData.Liml = liml;
        }

        if (TryCapture(@"MIN:\s([^\s]+ [^\s])", out string minimum))
        {
            fuelData.MinimumRequired = minimum;
        }

        return fuelData;
    }

    // Reads "Gain / Loss: GAIN|LOSS <amount>$/TON" and stores the amount as a
    // positive number for GAIN and a negative number for LOSS.
    private Corrections ExtractCorrections()
    {
        var corrections = new Corrections();

        Match match = Regex.Match(_operationText, @"Gain\s*/\s*Loss:\s*(GAIN|LOSS)\s*(\d+)\$/TON");
        if (match.Success
            && double.TryParse(match.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out double amount))
        {
            string type = match.Groups[1].Value; // "GAIN" or "LOSS"
            corrections.GainOrLoss = type == "GAIN" ? amount : -amount;
        }

        return corrections;
    }

    // Matches the pattern against the text and hands back its first capture
    // group (empty when there is no match).
    private bool TryCapture(string pattern, out string value)
    {
        Match match = Regex.Match(_operationText, pattern);
        value = match.Groups[1].Value;
        return match.Success;
    }

    // First capture group of the pattern, or an empty string when it does not match.
    private string CaptureOrEmpty(string pattern)
    {
        TryCapture(pattern, out string value);
        return value;
    }
}