// Provenance: introduced by commit 728718ffe56ad3916e074da073ed43bb7d58aa34
// ("first approach indexing", Joe Küng, Wed, 6 Nov 2024 10:29:04 +0100)
// as capzlog-ExtractDataFromPDF/util/indexing.cs.

using UglyToad.PdfPig;

namespace capzlog_ExtractDataFromPDF;

/// <summary>
/// Builds an index that pairs each page number of a PDF document with the
/// first line reported for that page, and resolves a first line back to its
/// page number.
/// </summary>
public class Indexing
{
    // Populated once in the constructor and never reassigned afterwards.
    private readonly List<(int pageNumber, string firstLine)> _pageIndex;
    private readonly PdfDocument _pdfDoc;

    /// <summary>
    /// Creates the index for <paramref name="pdfDoc"/>. The index is built
    /// eagerly, during construction.
    /// </summary>
    /// <param name="pdfDoc">The document whose pages are indexed.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="pdfDoc"/> is <c>null</c>.
    /// </exception>
    public Indexing(PdfDocument pdfDoc)
    {
        // Fail fast with a descriptive exception instead of a
        // NullReferenceException when the page count is read later on.
        _pdfDoc = pdfDoc ?? throw new ArgumentNullException(nameof(pdfDoc));
        _pageIndex = GetFirstLines();
    }

    /// <summary>
    /// Collects one (page number, first line) entry for every page number
    /// from 1 up to and including the document's page count.
    /// </summary>
    /// <returns>The entries, in ascending page-number order.</returns>
    private List<(int pageNumber, string firstLine)> GetFirstLines()
    {
        ExtractText extractText = new ExtractText();
        var firstLines = extractText.ExtractFirstLines(_pdfDoc);

        int pageCount = _pdfDoc.NumberOfPages;
        var index = new List<(int pageNumber, string firstLine)>(pageCount);

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // NOTE(review): firstLines is indexed with the 1-based page number.
            // This is only correct if ExtractFirstLines returns a collection
            // keyed by page number; if it returns a zero-based list, the first
            // entry is skipped and the last lookup goes out of range.
            // TODO confirm against ExtractText.ExtractFirstLines.
            index.Add((pageNumber, firstLines[pageNumber]));
        }

        return index;
    }

    /// <summary>
    /// Looks up the page whose indexed first line equals
    /// <paramref name="firstLine"/>.
    /// </summary>
    /// <param name="firstLine">The first line to search for.</param>
    /// <returns>
    /// The page number of the first matching entry in index order, or
    /// <c>-1</c> when no entry matches.
    /// </returns>
    public int GetPageNumber(string firstLine)
    {
        foreach (var (pageNumber, line) in _pageIndex)
        {
            if (line == firstLine)
            {
                return pageNumber;
            }
        }

        // Sentinel for "not found".
        return -1;
    }
}