// Provenance: added by commit 78354933d9b09489b1a8044c0224dea5d70d601d
// ("add ExtractTextBlocks", Joe Küng, Tue, 5 Nov 2024 10:54:44 +0100)
// as capzlog-ExtractDataFromPDF/ExtractText.cs.

using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;

namespace capzlog_ExtractDataFromPDF;

/// <summary>
/// Extracts text blocks from a single page of a PDF document using PdfPig's
/// document layout analysis components.
/// </summary>
public class ExtractText
{
    /// <summary>
    /// Builds the text blocks of one page by grouping the page's letters into words
    /// (<see cref="NearestNeighbourWordExtractor"/>), segmenting the words into blocks
    /// (<see cref="DocstrumBoundingBoxes"/>) and passing the blocks through
    /// <see cref="UnsupervisedReadingOrderDetector"/>.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <param name="pageNumber">
    /// The page to process; must be between 1 and <c>document.NumberOfPages</c> inclusive.
    /// </param>
    /// <returns>The text blocks of the page as returned by the reading order detector.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is less than 1 or greater than the number of pages.
    /// </exception>
    public IEnumerable<TextBlock> ExtractTextBlocks(PdfDocument document, int pageNumber)
    {
        // Fail with a descriptive argument exception instead of a NullReferenceException
        // when the range check below dereferences the document.
        if (document is null)
        {
            throw new ArgumentNullException(nameof(document));
        }

        if (pageNumber < 1 || pageNumber > document.NumberOfPages)
        {
            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
        }

        var page = document.GetPage(pageNumber);

        // Letters -> words -> blocks -> reading order.
        var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);

        return UnsupervisedReadingOrderDetector.Instance.Get(textBlocks);
    }
}