add ExtractTextBlocks
This commit is contained in:
32
capzlog-ExtractDataFromPDF/ExtractText.cs
Normal file
32
capzlog-ExtractDataFromPDF/ExtractText.cs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
using UglyToad.PdfPig;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||||
|
|
||||||
|
namespace capzlog_ExtractDataFromPDF;
|
||||||
|
|
||||||
|
/// <summary>
/// Extracts layout-level text blocks from a single page of a PDF document.
/// </summary>
public class ExtractText
{
    /// <summary>
    /// Groups the letters of one page into words, segments those words into text blocks,
    /// and passes the blocks through the reading order detector.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <param name="pageNumber">
    /// The 1-based page number; must lie between 1 and <c>document.NumberOfPages</c> inclusive.
    /// </param>
    /// <returns>The text blocks of the page as produced by the reading order detector.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is less than 1 or greater than the document's page count.
    /// </exception>
    public IEnumerable<TextBlock> ExtractTextBlocks(PdfDocument document, int pageNumber)
    {
        // Fail with a descriptive argument exception instead of a NullReferenceException
        // when the range check below dereferences the document.
        ArgumentNullException.ThrowIfNull(document);

        if (pageNumber < 1 || pageNumber > document.NumberOfPages)
        {
            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
        }

        var page = document.GetPage(pageNumber);

        // Stage 1: letters -> words.
        var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);

        // Stage 2: words -> text blocks.
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);

        // Stage 3: hand the blocks to the reading order detector and return its result.
        return UnsupervisedReadingOrderDetector.Instance.Get(textBlocks);
    }
}
|
||||||
Reference in New Issue
Block a user