From 9c6c61ada9042a7c80eb7c51f8f411209857a809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joe=20K=C3=BCng?= Date: Wed, 6 Nov 2024 10:28:54 +0100 Subject: [PATCH] ExtractText implemented --- capzlog-ExtractDataFromPDF/ExtractText.cs | 32 ---------------- .../util/ExtractText.cs | 37 +++++++++++++++++++ 2 files changed, 37 insertions(+), 32 deletions(-) delete mode 100644 capzlog-ExtractDataFromPDF/ExtractText.cs create mode 100644 capzlog-ExtractDataFromPDF/util/ExtractText.cs diff --git a/capzlog-ExtractDataFromPDF/ExtractText.cs b/capzlog-ExtractDataFromPDF/ExtractText.cs deleted file mode 100644 index 245c3ea..0000000 --- a/capzlog-ExtractDataFromPDF/ExtractText.cs +++ /dev/null @@ -1,32 +0,0 @@ -using UglyToad.PdfPig; -using UglyToad.PdfPig.DocumentLayoutAnalysis; -using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; -using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; -using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; - -namespace capzlog_ExtractDataFromPDF; - -public class ExtractText -{ - public IEnumerable ExtractTextBlocks(PdfDocument document, int pageNumber) - { - if (pageNumber < 1 || pageNumber > document.NumberOfPages) - { - throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range."); - } - - var page = document.GetPage(pageNumber); - - var letters = page.Letters; - var wordExtractor = NearestNeighbourWordExtractor.Instance; - - var words = wordExtractor.GetWords(letters); - - var pageSegmenter = DocstrumBoundingBoxes.Instance; - - var textBlocks = pageSegmenter.GetBlocks(words); - - var readingOrder = UnsupervisedReadingOrderDetector.Instance; - return readingOrder.Get(textBlocks); - } -} \ No newline at end of file diff --git a/capzlog-ExtractDataFromPDF/util/ExtractText.cs b/capzlog-ExtractDataFromPDF/util/ExtractText.cs new file mode 100644 index 0000000..5f4a700 --- /dev/null +++ b/capzlog-ExtractDataFromPDF/util/ExtractText.cs @@ -0,0 +1,37 @@ +using UglyToad.PdfPig; +using 
UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
+using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
+
+namespace capzlog_ExtractDataFromPDF;
+
+/// <summary>
+/// Reads plain text out of PDF pages. Words are grouped with
+/// <see cref="NearestNeighbourWordExtractor"/> and segmented into blocks with
+/// <see cref="DefaultPageSegmenter"/>; only the first block of a page is ever read.
+/// </summary>
+public class ExtractText
+{
+    /// <summary>
+    /// Returns the text of the first block found on the given page.
+    /// </summary>
+    /// <param name="document">The PDF document to read from.</param>
+    /// <param name="pageNumber">Page number, from 1 up to <c>document.NumberOfPages</c> inclusive.</param>
+    /// <returns>The first block's text, or an empty string when the segmenter yields no block.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageNumber"/> is outside the document.</exception>
+    public string ExtractTextBlocks(PdfDocument document, int pageNumber)
+    {
+        ArgumentNullException.ThrowIfNull(document);
+
+        if (pageNumber < 1 || pageNumber > document.NumberOfPages)
+        {
+            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
+        }
+
+        return ReadFirstBlockText(document, pageNumber);
+    }
+
+    /// <summary>
+    /// Collects, for every page of the document, the first line of that page's first block.
+    /// </summary>
+    /// <param name="document">The PDF document to read from.</param>
+    /// <returns>
+    /// One entry per page, in page order. A page whose segmentation yields no block
+    /// contributes an empty string, so list index + 1 always equals the page number.
+    /// </returns>
+    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
+    public List<string> ExtractFirstLines(PdfDocument document)
+    {
+        ArgumentNullException.ThrowIfNull(document);
+
+        int pageCount = document.NumberOfPages;
+        var firstLines = new List<string>(pageCount);
+
+        // Pages are addressed from 1 to NumberOfPages inclusive.
+        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
+        {
+            // Splitting an empty string still yields one (empty) element, so [0] is always valid.
+            firstLines.Add(ReadFirstBlockText(document, pageNumber).Split("\n")[0]);
+        }
+
+        return firstLines;
+    }
+
+    /// <summary>
+    /// Runs word extraction and page segmentation on one page and returns the first block's text.
+    /// </summary>
+    /// <remarks>The caller is responsible for passing a page number that exists in the document.</remarks>
+    private static string ReadFirstBlockText(PdfDocument document, int pageNumber)
+    {
+        var page = document.GetPage(pageNumber);
+        var words = page.GetWords(NearestNeighbourWordExtractor.Instance);
+        var blocks = DefaultPageSegmenter.Instance.GetBlocks(words);
+
+        // The segmenter can return no blocks (e.g. a page without extractable text);
+        // indexing [0] unguarded would throw in that case.
+        return blocks.Count == 0 ? string.Empty : blocks[0].Text;
+    }
+}
\ No newline at end of file