ExtractText implemented

This commit is contained in:
Joe Küng
2024-11-06 10:28:54 +01:00
parent 78354933d9
commit 9c6c61ada9
2 changed files with 37 additions and 32 deletions

View File

@@ -0,0 +1,37 @@
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
namespace capzlog_ExtractDataFromPDF;
/// <summary>
/// Extracts text from PDF pages using PdfPig's nearest-neighbour word extractor
/// and default page segmenter.
/// </summary>
public class ExtractText
{
    /// <summary>
    /// Returns the text of the first text block found on the given page.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <param name="pageNumber">1-based page number.</param>
    /// <returns>
    /// The text of the first block produced by the page segmenter, or an empty
    /// string when the segmenter yields no blocks for the page.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is less than 1 or greater than the page count.
    /// </exception>
    public string ExtractTextBlocks(PdfDocument document, int pageNumber)
    {
        ArgumentNullException.ThrowIfNull(document);

        if (pageNumber < 1 || pageNumber > document.NumberOfPages)
        {
            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
        }

        return GetFirstBlockText(document, pageNumber);
    }

    /// <summary>
    /// Collects the first line of the first text block of every page.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <returns>
    /// One entry per page, in page order. A page for which the segmenter yields
    /// no blocks contributes an empty string, so the entry at index <c>i</c>
    /// always belongs to page <c>i + 1</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    public List<string> ExtractFirstLines(PdfDocument document)
    {
        ArgumentNullException.ThrowIfNull(document);

        var firstLines = new List<string>(document.NumberOfPages);

        for (int pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
        {
            // Splitting an empty string yields a single empty element, so pages
            // without blocks safely contribute "".
            string blockText = GetFirstBlockText(document, pageNumber);
            firstLines.Add(blockText.Split("\n")[0]);
        }

        return firstLines;
    }

    /// <summary>
    /// Runs word extraction and page segmentation on one page and returns the
    /// first block's text, or an empty string when there are no blocks.
    /// </summary>
    private static string GetFirstBlockText(PdfDocument document, int pageNumber)
    {
        var page = document.GetPage(pageNumber);
        var words = page.GetWords(NearestNeighbourWordExtractor.Instance);
        var blocks = DefaultPageSegmenter.Instance.GetBlocks(words);

        // Guard against indexing an empty block list, which would otherwise throw.
        return blocks.Count > 0 ? blocks[0].Text : string.Empty;
    }
}