added extractor
This commit is contained in:
@@ -8,84 +8,31 @@ namespace capzlog_ExtractDataFromPDF
|
||||
{
|
||||
public class ParseCzech
|
||||
{
|
||||
public static readonly String DEST = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\text.txt";
|
||||
public static readonly String SRC = "C:\\Users\\joeku\\RiderProjects\\capzlog-ExtractDataFromPDF\\Task 1 - Extract Data from a PDF File - Sample File.pdf";
|
||||
public static void Main(String[] args)
|
||||
{
|
||||
using (var pdfDoc = PdfDocument.Open(SRC))
|
||||
{
|
||||
//Prova prova = new Prova();
|
||||
//prova.ProvaRead(SRC);
|
||||
|
||||
|
||||
|
||||
// var testoEstratto = EstraiTestoDaPosizione(SRC, pagina: 85, xMin: 400, yMin: 400, xMax: 800, yMax: 800);
|
||||
|
||||
//Console.WriteLine("Testo Estratto:");
|
||||
// Console.WriteLine(testoEstratto);
|
||||
|
||||
//EstraiCrewConRegex(SRC, 85);
|
||||
|
||||
SinglePageReader reader = new SinglePageReader();
|
||||
reader.GetCrewAndFlightAssignment(pdfDoc, 89);
|
||||
string content = reader.GetCrewAndFlightAssignment(pdfDoc, 89);
|
||||
BriefingExtractor briefingExtractor = new BriefingExtractor(content);
|
||||
Console.WriteLine(""+briefingExtractor.ExtractCrew()[0].Function);
|
||||
Console.WriteLine(briefingExtractor.ExtractPassengers().Business);
|
||||
Console.WriteLine(briefingExtractor.ExtractFlightAssignment().DOI);
|
||||
|
||||
string content2 = reader.GetCrewAndFlightAssignment(pdfDoc, 12);
|
||||
|
||||
FlightPlanExtractor flightPlanExtractor = new FlightPlanExtractor(content2);
|
||||
|
||||
Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().FuelData.Limc);
|
||||
Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().MassLoad.ZeroFuelMass);
|
||||
Console.WriteLine(flightPlanExtractor.ExtractFlightPlan().Schedule.ScheduledArrivalTime);
|
||||
Console.WriteLine("GainLoss: "+flightPlanExtractor.ExtractFlightPlan().Correction.GainOrLoss);
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// Collects the words on one page of a PDF whose bounding box lies entirely inside a rectangle.
/// </summary>
/// <param name="filePath">Path of the PDF file to open.</param>
/// <param name="pagina">Page number, passed unchanged to <c>GetPage</c>.</param>
/// <param name="xMin">Lowest accepted value for a word's left edge.</param>
/// <param name="yMin">Lowest accepted value for a word's bottom edge.</param>
/// <param name="xMax">Highest accepted value for a word's right edge.</param>
/// <param name="yMax">Highest accepted value for a word's top edge.</param>
/// <returns>The accepted words, in the order <c>GetWords</c> yields them, joined by single spaces.</returns>
static string EstraiTestoDaPosizione(string filePath, int pagina, double xMin, double yMin, double xMax, double yMax)
{
    using PdfDocument document = PdfDocument.Open(filePath);

    var page = document.GetPage(pagina);
    var wordsInArea = new List<string>();

    foreach (var word in page.GetWords())
    {
        var box = word.BoundingBox;

        // A word is kept only when all four edges are within the bounds (bounds inclusive).
        bool insideHorizontally = box.Left >= xMin && box.Right <= xMax;
        bool insideVertically = box.Bottom >= yMin && box.Top <= yMax;
        if (!(insideHorizontally && insideVertically))
        {
            continue;
        }

        wordsInArea.Add(word.Text);
    }

    return string.Join(" ", wordsInArea);
}
|
||||
|
||||
/// <summary>
/// Scans the text of one PDF page for crew entries and formats each one as
/// "role code name - position".
/// </summary>
/// <remarks>
/// The formatted entries are only gathered in a local list; the method neither returns nor prints them.
/// </remarks>
/// <param name="filePath">Path of the PDF file to open.</param>
/// <param name="pagina">Page number, passed unchanged to <c>GetPage</c>.</param>
static void EstraiCrewConRegex(string filePath, int pagina)
{
    var crewEntries = new List<string>();

    using PdfDocument document = PdfDocument.Open(filePath);
    string pageText = document.GetPage(pagina).Text;

    // Compact pattern: role, code, two-word name, then the position text up to the next
    // role keyword, one of the words Observer/Contacts/Crew, or the end of the text.
    var crewPattern = new Regex(@"\b(CMD|COP|CAB|SEN)\s(\w+)\s([A-Za-z]+\s[A-Za-z]+)\s(.+?)(?=\bCMD|\bCOP|\bCAB|\bSEN|Observer|Contacts|Crew|\Z)");

    foreach (Match entry in crewPattern.Matches(pageText))
    {
        GroupCollection parts = entry.Groups;
        crewEntries.Add($"{parts[1].Value} {parts[2].Value} {parts[3].Value} - {parts[4].Value}");
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
|
||||
// reader.ReadPage(SRC, 85); // Read page 2 for example
|
||||
//
|
||||
//FileInfo file = new FileInfo(DEST);
|
||||
//file.Directory.Create();
|
||||
|
||||
//new ParseCzech().ManipulatePdf(DEST);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -8,79 +8,17 @@ namespace capzlog_ExtractDataFromPDF;
|
||||
// Gets the flight assignment and flight crew text
|
||||
public class SinglePageReader
|
||||
{
|
||||
// Method to read a specific page
|
||||
// public void ReadPage(string pdfPath, int pageNumber)
|
||||
// {
|
||||
// using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(pdfPath)))
|
||||
// {
|
||||
// if (pageNumber < 1 || pageNumber > pdfDoc.GetNumberOfPages())
|
||||
// {
|
||||
// Console.WriteLine($"Page {pageNumber} does not exist in the document.");
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// // Create a text extraction renderer
|
||||
// LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
|
||||
//
|
||||
// // Process the specified page content
|
||||
// PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
|
||||
// parser.ProcessPageContent(pdfDoc.GetPage(pageNumber));
|
||||
//
|
||||
// // Extracted text from the specified page
|
||||
// string pageText = strategy.GetResultantText();
|
||||
//
|
||||
// // Log the extracted text to the console (for now, we don't return anything)
|
||||
// Console.WriteLine($"Text from Page {pageNumber}:");
|
||||
// Console.WriteLine(pageText);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
/// <summary>
/// Extracts the text of one page of the document, echoes it to the console and returns it.
/// </summary>
/// <remarks>
/// The diff showed both the old <c>void</c> and the new <c>string</c> signature above a single body;
/// the <c>string</c> form is kept because the body returns a value and <c>Main</c> assigns the result.
/// </remarks>
/// <param name="pdfDocument">Open PDF document to read from.</param>
/// <param name="pageNumber">Page to extract; passed unchanged to <c>ExtractText.ExtractTextBlocks</c>.</param>
/// <returns>The value produced by <c>ExtractTextBlocks</c> for that page.</returns>
public string GetCrewAndFlightAssignment(PdfDocument pdfDocument, int pageNumber)
{
    ExtractText extractText = new ExtractText();
    var textBlocks = extractText.ExtractTextBlocks(pdfDocument, pageNumber);

    // NOTE(review): looks like leftover debug output - every call dumps the whole page; confirm it is still wanted.
    Console.WriteLine(textBlocks);

    return textBlocks;
}
|
||||
|
||||
/// <summary>
/// Builds the crew list from the text blocks of a crew page.
/// </summary>
/// <remarks>
/// Block 0 is read as the list of functions and block 3 as the list of "code name" entries,
/// one entry per line. Function line <c>i</c> (starting at 2) is paired with name line <c>i - 1</c>.
/// NOTE(review): the block positions and the two offsets presumably skip header lines of the
/// sample layout - confirm against the PDF.
/// </remarks>
/// <param name="textBlocks">Text blocks of the page; at least four are needed.</param>
/// <returns>
/// One <c>Crew</c> per function line. The list is empty when fewer than four blocks are supplied;
/// a function without a matching name line keeps the default <c>Lc</c> and <c>Name</c>.
/// </returns>
private List<Crew> GetCrewData(IEnumerable<TextBlock> textBlocks)
{
    List<Crew> crewList = new List<Crew>();

    // Materialize once: two blocks are read by index and the source may be a lazy sequence.
    var blocks = textBlocks.ToList();
    if (blocks.Count < 4)
    {
        return crewList;
    }

    string[] functions = blocks[0].Text.Split('\n');
    string[] names = blocks[3].Text.Split('\n');

    for (int i = 2; i < functions.Length; i++)
    {
        var crew = new Crew();
        crew.Function = functions[i];

        // The name column can be shorter than the function column; do not index past its end.
        if (i - 1 < names.Length)
        {
            // Split at the first space only: "VEN Nico Verhelst" -> "VEN" + "Nico Verhelst".
            string[] parts = names[i - 1].Split(new[] { ' ' }, 2);
            crew.Lc = parts[0];
            crew.Name = parts.Length > 1 ? parts[1] : "";
        }

        crewList.Add(crew);
    }

    return crewList;
}
|
||||
|
||||
}
|
||||
22
capzlog-ExtractDataFromPDF/models/Crew.cs
Normal file
22
capzlog-ExtractDataFromPDF/models/Crew.cs
Normal file
@@ -0,0 +1,22 @@
|
||||
namespace capzlog_ExtractDataFromPDF.models
|
||||
{
|
||||
/// <summary>
/// One crew member as read from the briefing text.
/// </summary>
public class Crew
{
    /// <summary>Function code of the crew member; the extractor matches CMD, COP, CAB or SEN.</summary>
    public string Function { get; set; } = string.Empty;

    /// <summary>Short code that follows the function in the briefing text.</summary>
    public string Lc { get; set; } = string.Empty;

    /// <summary>Name of the crew member.</summary>
    public string Name { get; set; } = string.Empty;
}
|
||||
|
||||
/// <summary>
/// Values read from the "DOW:" and "DOI:" entries of the briefing text.
/// </summary>
public class FlightAssigment
{
    /// <summary>Number following "DOW:" (the briefing text gives it in kg). Defaults to 0.</summary>
    public double DOW { get; set; }

    /// <summary>Number following "DOI:". Defaults to 0.</summary>
    public double DOI { get; set; }
}
|
||||
/// <summary>
/// Passenger counts per cabin class.
/// </summary>
public class Passegers
{
    /// <summary>Number of business passengers. Defaults to 0.</summary>
    public int Business { get; set; }

    /// <summary>Number of economy passengers. Defaults to 0.</summary>
    public int Economy { get; set; }
}
|
||||
}
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
namespace capzlog_ExtractDataFromPDF.models;
|
||||
|
||||
/// <summary>
/// One crew member, including the spelled-out form of the function.
/// </summary>
public class Crew
{
    /// <summary>Function of the crew member.</summary>
    public string Function { get; set; } = string.Empty;

    /// <summary>Short code associated with the crew member.</summary>
    public string Lc { get; set; } = string.Empty;

    /// <summary>Name of the crew member.</summary>
    public string Name { get; set; } = string.Empty;

    /// <summary>Extended form of the function.</summary>
    public string FunctionExtended { get; set; } = string.Empty;
}
|
||||
109
capzlog-ExtractDataFromPDF/util/BriefingExtractor.cs
Normal file
109
capzlog-ExtractDataFromPDF/util/BriefingExtractor.cs
Normal file
@@ -0,0 +1,109 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using capzlog_ExtractDataFromPDF.models;
|
||||
|
||||
namespace capzlog_ExtractDataFromPDF;
|
||||
|
||||
/// <summary>
/// Pulls the flight assignment, passenger counts and crew list out of the plain text
/// of a briefing page using regular expressions.
/// </summary>
/// <remarks>
/// Extraction is best-effort: a value whose pattern is not found keeps the model's default.
/// </remarks>
public class BriefingExtractor
{
    private readonly string briefingText;

    /// <summary>Creates an extractor over the given page text.</summary>
    /// <param name="briefingText">Text of the briefing page.</param>
    public BriefingExtractor(string briefingText)
    {
        this.briefingText = briefingText;
    }

    /// <summary>
    /// Reads the numbers following "DOW:" (which must be suffixed with "kg") and "DOI:".
    /// </summary>
    /// <returns>A <see cref="FlightAssigment"/>; values that are not found stay at their defaults.</returns>
    public FlightAssigment ExtractFlightAssignment()
    {
        FlightAssigment flightAssignment = new FlightAssigment();

        var dowMatch = Regex.Match(briefingText, @"DOW:\s*(\d+(?:\.\d+)?)kg");
        if (dowMatch.Success)
        {
            flightAssignment.DOW = ParseInvariantDouble(dowMatch.Groups[1].Value);
        }

        var doiMatch = Regex.Match(briefingText, @"DOI:\s*(\d+(?:\.\d+)?)");
        if (doiMatch.Success)
        {
            flightAssignment.DOI = ParseInvariantDouble(doiMatch.Groups[1].Value);
        }

        return flightAssignment;
    }

    /// <summary>
    /// Reads the passenger counts from the first "business/economy" number pair in the text.
    /// </summary>
    /// <returns>A <see cref="Passegers"/>; both counts stay 0 when no usable pair is found.</returns>
    public Passegers ExtractPassengers()
    {
        Passegers passengers = new Passegers();

        // NOTE(review): this takes the first "digits/digits" anywhere in the text - confirm that
        // nothing else of that shape can precede the passenger figures on the page.
        var paxMatch = Regex.Match(briefingText, @"(\d+)\/(\d+)");
        if (paxMatch.Success
            && TryParseCount(paxMatch.Groups[1].Value, out int business)
            && TryParseCount(paxMatch.Groups[2].Value, out int economy))
        {
            passengers.Business = business;
            passengers.Economy = economy;
        }

        return passengers;
    }

    //TODO: check for multiple crew members in the same function
    /// <summary>
    /// Builds the crew list from the lines that start with a function code (CMD, COP, CAB, SEN)
    /// followed by a three-letter code.
    /// </summary>
    /// <remarks>
    /// Lines following a crew line are appended to it until the next crew line starts.
    /// Text before the first crew line is ignored.
    /// </remarks>
    /// <returns>The crew members found, in text order; empty when the text has no crew lines.</returns>
    public List<Crew> ExtractCrew()
    {
        List<Crew> crewList = new List<Crew>();

        var lines = briefingText.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);
        List<string> combinedLines = new List<string>();

        // Collect only from the first crew line on. The previous code collected the leading text
        // too and then removed entry 0, which discarded the first crew member whenever the text
        // began directly with a crew line and threw when there was nothing to remove.
        bool crewStarted = false;
        string currentLine = "";
        foreach (var line in lines)
        {
            var trimmedLine = line.Trim();

            if (Regex.IsMatch(trimmedLine, @"^(CMD|COP|CAB|SEN)\s[A-Z]{3}"))
            {
                if (crewStarted)
                {
                    combinedLines.Add(currentLine.Trim());
                }

                currentLine = trimmedLine;
                crewStarted = true;
            }
            else if (crewStarted)
            {
                // Continuation of the current crew entry.
                currentLine += " " + trimmedLine;
            }
        }

        if (crewStarted)
        {
            combinedLines.Add(currentLine.Trim());
        }

        if (combinedLines.Count > 0)
        {
            // The last entry also swallowed everything after the crew lines; cut it at "X:".
            // NOTE(review): "X:" presumably starts the section that follows the crew list - confirm.
            int last = combinedLines.Count - 1;
            int index = combinedLines[last].IndexOf("X:", StringComparison.Ordinal);
            if (index >= 0)
            {
                combinedLines[last] = combinedLines[last].Substring(0, index);
            }
        }

        foreach (var line in combinedLines)
        {
            var match = Regex.Match(line,
                @"(CMD|COP|CAB|SEN)\s+(\w+)\s+([A-Za-zÀ-ÿ\s\-]+)");
            if (match.Success)
            {
                Crew crewMember = new Crew
                {
                    Function = match.Groups[1].Value,
                    Lc = match.Groups[2].Value,
                    // The name class also matches whitespace, so the capture can end in blanks.
                    Name = match.Groups[3].Value.Trim(),
                };
                crewList.Add(crewMember);
            }
        }

        return crewList;
    }

    // Parses a "123" / "123.45" token independently of the machine's regional settings.
    private static double ParseInvariantDouble(string text)
    {
        return double.Parse(
            text,
            System.Globalization.NumberStyles.Float,
            System.Globalization.CultureInfo.InvariantCulture);
    }

    // Parses a run of digits; returns false instead of throwing when it does not fit an int.
    private static bool TryParseCount(string text, out int count)
    {
        return int.TryParse(
            text,
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out count);
    }
}
|
||||
144
capzlog-ExtractDataFromPDF/util/FlightPlanExtractor.cs
Normal file
144
capzlog-ExtractDataFromPDF/util/FlightPlanExtractor.cs
Normal file
@@ -0,0 +1,144 @@
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
using capzlog_ExtractDataFromPDF.models;
|
||||
|
||||
namespace capzlog_ExtractDataFromPDF;
|
||||
|
||||
/// <summary>
/// Extracts flight-plan data (general info, schedule, masses, fuel and corrections)
/// from the plain text of a flight-plan page using regular expressions.
/// </summary>
/// <remarks>
/// Extraction is best-effort: no method throws because a field is missing or malformed.
/// </remarks>
public class FlightPlanExtractor
{
    private readonly string OperationText;

    /// <summary>Creates an extractor over the given page text.</summary>
    /// <param name="operationText">Text of the flight-plan page.</param>
    public FlightPlanExtractor(string operationText)
    {
        OperationText = operationText;
    }

    /// <summary>
    /// Runs every extraction step and assembles the results.
    /// </summary>
    /// <returns>A new <c>Flight</c> with Info, Schedule, MassLoad, FuelData and Correction filled in.</returns>
    public Flight ExtractFlightPlan()
    {
        Flight flight = new Flight();
        flight.Info = ExtractFlightInfo();
        flight.Schedule = ExtractTimes();
        flight.MassLoad = ExtractLoadMass();
        flight.FuelData = ExtractFuel();
        flight.Correction = ExtractCorrections();
        return flight;
    }

    /// <summary>
    /// Reads date, registration, aircraft type, route, first alternate, flight number and ATC code.
    /// </summary>
    /// <returns>
    /// A <c>FlightInfo</c>. Text fields that are not found are set to an empty string;
    /// <c>Date</c> is left untouched when it is missing or is not a valid date.
    /// </returns>
    private FlightInfo ExtractFlightInfo()
    {
        FlightInfo flightInfo = new FlightInfo();

        if (TryCapture(@"Date:\s(\d{2}[A-Z]{3}\d{2})", out string originalDate)
            // The pattern also accepts tokens such as "99ABC99"; skip those instead of throwing.
            && DateTime.TryParseExact(originalDate, "ddMMMyy", CultureInfo.InvariantCulture,
                                      DateTimeStyles.None, out DateTime parsedDate))
        {
            // Convert e.g. "19MAR24" to "19.03.2024". The invariant culture keeps the Gregorian
            // calendar even when the machine's culture uses another one.
            flightInfo.Date = parsedDate.ToString("dd.MM.yyyy", CultureInfo.InvariantCulture);
        }

        flightInfo.Registration = CaptureOrEmpty(@"Reg\.\s*:\s*([A-Z0-9]+)");
        flightInfo.AircraftType = CaptureOrEmpty(@"Type:\s([A-Z0-9]+)");
        flightInfo.Departure = CaptureOrEmpty(@"From:\s([A-Z]{4})");
        flightInfo.Destination = CaptureOrEmpty(@"To:\s([A-Z]{4})");
        flightInfo.Alternate1 = CaptureOrEmpty(@"ALTN1:\s([A-Z]{4})");
        flightInfo.FlightNumber = CaptureOrEmpty(@"FltNr:\s([A-Z0-9]+)");
        flightInfo.ATCCode = CaptureOrEmpty(@"ATC:\s([A-Z0-9]+)");
        return flightInfo;
    }

    /// <summary>
    /// Reads the scheduled departure (STD) and arrival (STA) times, which must appear
    /// as "STD: hh:mm STA: hh:mm".
    /// </summary>
    /// <returns>A <c>Times</c>; both values keep their defaults when the pair is not found.</returns>
    private Times ExtractTimes()
    {
        var times = new Times();

        var match = Regex.Match(OperationText, @"STD:\s(\d{2}:\d{2})\sSTA:\s(\d{2}:\d{2})");
        if (match.Success)
        {
            times.ScheduledDepartureTime = match.Groups[1].Value; // STD
            times.ScheduledArrivalTime = match.Groups[2].Value;   // STA
        }

        return times;
    }

    /// <summary>
    /// Reads the digits following "ZFM:" as the zero fuel mass.
    /// </summary>
    /// <returns>A <c>LoadMass</c>; <c>ZeroFuelMass</c> keeps its default when not found.</returns>
    private LoadMass ExtractLoadMass()
    {
        LoadMass loadMass = new LoadMass();

        if (TryCapture(@"ZFM:\s(\d+)", out string zeroFuelMass))
        {
            loadMass.ZeroFuelMass = zeroFuelMass;
        }

        return loadMass;
    }

    /// <summary>
    /// Reads the values following the "LIMC:", "LIML:" and "MIN:" labels.
    /// </summary>
    /// <returns>A <c>Fuel</c>; values that are not found keep their defaults.</returns>
    private Fuel ExtractFuel()
    {
        // NOTE(review): each capture is "<token> <one character>" - the part after the space is a
        // single non-whitespace character, so a longer second token is cut short. Confirm against
        // the PDF whether `[^\s]+` was intended for the second part.
        var fuelData = new Fuel();

        if (TryCapture(@"LIMC:\s([^\s]+ [^\s])", out string limc))
        {
            fuelData.Limc = limc;
        }

        if (TryCapture(@"LIML:\s([^\s]+ [^\s])", out string liml))
        {
            fuelData.Liml = liml;
        }

        if (TryCapture(@"MIN:\s([^\s]+ [^\s])", out string minimum))
        {
            fuelData.MinimumRequired = minimum;
        }

        return fuelData;
    }

    /// <summary>
    /// Reads the "Gain / Loss: GAIN|LOSS &lt;amount&gt;$/TON" entry.
    /// </summary>
    /// <returns>
    /// A <c>Corrections</c> whose <c>GainOrLoss</c> is the amount for GAIN and the negated amount
    /// for LOSS; it keeps its default when the entry is not found.
    /// </returns>
    private Corrections ExtractCorrections()
    {
        var corrections = new Corrections();

        var match = Regex.Match(OperationText, @"Gain\s*/\s*Loss:\s*(GAIN|LOSS)\s*(\d+)\$/TON");
        if (match.Success)
        {
            string type = match.Groups[1].Value; // "GAIN" or "LOSS"
            double amount = double.Parse(match.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture);

            // Positive for GAIN, negative for LOSS.
            corrections.GainOrLoss = type == "GAIN" ? amount : -amount;
        }

        return corrections;
    }

    // Returns the first capture group of the first match, or "" when the pattern does not match.
    private string CaptureOrEmpty(string pattern)
    {
        return Regex.Match(OperationText, pattern).Groups[1].Value;
    }

    // Reports whether the pattern matches and, if so, hands back its first capture group.
    private bool TryCapture(string pattern, out string value)
    {
        Match match = Regex.Match(OperationText, pattern);
        value = match.Success ? match.Groups[1].Value : "";
        return match.Success;
    }
}
|
||||
Reference in New Issue
Block a user