// capzlog-ExtractDataFromPDF/ExtractText.cs
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;

namespace capzlog_ExtractDataFromPDF;

/// <summary>
/// Extracts text blocks from a single page of a PDF document by chaining
/// PdfPig's word extraction, page segmentation and reading-order detection.
/// </summary>
public class ExtractText
{
    /// <summary>
    /// Returns the text blocks found on the requested page, as produced by the
    /// reading-order detector.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <param name="pageNumber">
    /// Page to process; must lie between 1 and
    /// <see cref="PdfDocument.NumberOfPages"/> inclusive.
    /// </param>
    /// <returns>The page's text blocks as returned by the reading-order detector.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="document"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is less than 1 or greater than the
    /// document's page count.
    /// </exception>
    public IEnumerable<TextBlock> ExtractTextBlocks(PdfDocument document, int pageNumber)
    {
        // Fail with a descriptive argument exception instead of a
        // NullReferenceException on the NumberOfPages access below.
        ArgumentNullException.ThrowIfNull(document);

        if (pageNumber < 1 || pageNumber > document.NumberOfPages)
        {
            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
        }

        var page = document.GetPage(pageNumber);

        // Step 1: group the page's letters into words.
        var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);

        // Step 2: segment the words into text blocks.
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);

        // Step 3: let the reading-order detector sequence the blocks.
        return UnsupervisedReadingOrderDetector.Instance.Get(textBlocks);
    }
}