ExtractText implemented
This commit is contained in:
@@ -1,32 +0,0 @@
|
||||
using UglyToad.PdfPig;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||
|
||||
namespace capzlog_ExtractDataFromPDF;
|
||||
|
||||
/// <summary>
/// Turns the letters of a single PDF page into text blocks using PdfPig's
/// document layout analysis components.
/// </summary>
public class ExtractText
{
    /// <summary>
    /// Groups the letters of one page into words, segments those words into
    /// blocks and returns the blocks as ordered by the reading-order detector.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <param name="pageNumber">1-based number of the page to process.</param>
    /// <returns>The text blocks produced by the reading-order detector.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is below 1 or above the document's page count.
    /// </exception>
    public IEnumerable<TextBlock> ExtractTextBlocks(PdfDocument document, int pageNumber)
    {
        // Guard clause: only page numbers 1..NumberOfPages are accepted.
        bool pageExists = pageNumber >= 1 && pageNumber <= document.NumberOfPages;
        if (!pageExists)
        {
            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
        }

        // Letters -> words (nearest-neighbour word extractor).
        var glyphs = document.GetPage(pageNumber).Letters;
        var pageWords = NearestNeighbourWordExtractor.Instance.GetWords(glyphs);

        // Words -> blocks (Docstrum bounding-box segmenter).
        var segmentedBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(pageWords);

        // Blocks -> blocks in detected reading order.
        return UnsupervisedReadingOrderDetector.Instance.Get(segmentedBlocks);
    }
}
|
||||
37
capzlog-ExtractDataFromPDF/util/ExtractText.cs
Normal file
37
capzlog-ExtractDataFromPDF/util/ExtractText.cs
Normal file
@@ -0,0 +1,37 @@
|
||||
using UglyToad.PdfPig;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||
|
||||
namespace capzlog_ExtractDataFromPDF;
|
||||
|
||||
/// <summary>
/// Extracts text from PDF pages using PdfPig's nearest-neighbour word
/// extractor and default page segmenter.
/// </summary>
public class ExtractText
{
    /// <summary>
    /// Returns the text of the first block found on the given page.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <param name="pageNumber">1-based number of the page to process.</param>
    /// <returns>
    /// The text of the first block on the page, or an empty string when the
    /// segmenter yields no blocks (for example, a page without extractable text).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is below 1 or above the document's page count.
    /// </exception>
    public string ExtractTextBlocks(PdfDocument document, int pageNumber)
    {
        ArgumentNullException.ThrowIfNull(document);

        if (pageNumber < 1 || pageNumber > document.NumberOfPages)
        {
            throw new ArgumentOutOfRangeException(nameof(pageNumber), "Page number is out of range.");
        }

        return ReadFirstBlockText(document, pageNumber);
    }

    /// <summary>
    /// Collects the first line of the first text block of every page.
    /// </summary>
    /// <param name="document">The opened PDF document to read from.</param>
    /// <returns>
    /// One entry per page, in page order. A page that yields no blocks
    /// contributes an empty string, so index <c>i</c> always corresponds to
    /// page <c>i + 1</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    public List<string> ExtractFirstLines(PdfDocument document)
    {
        ArgumentNullException.ThrowIfNull(document);

        var firstLines = new List<string>(document.NumberOfPages);

        for (int pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
        {
            string blockText = ReadFirstBlockText(document, pageNumber);

            // Splitting an empty string yields a single empty element, so a
            // page without blocks simply adds "" instead of throwing.
            firstLines.Add(blockText.Split("\n")[0]);
        }

        return firstLines;
    }

    /// <summary>
    /// Runs the page -> words -> blocks pipeline and returns the first block's
    /// text, or an empty string when no block was produced.
    /// </summary>
    private static string ReadFirstBlockText(PdfDocument document, int pageNumber)
    {
        var page = document.GetPage(pageNumber);
        var words = page.GetWords(NearestNeighbourWordExtractor.Instance);
        var blocks = DefaultPageSegmenter.Instance.GetBlocks(words);

        // Indexing blocks[0] unconditionally throws when the block list is empty.
        return blocks.Count > 0 ? blocks[0].Text : string.Empty;
    }
}
|
||||
Reference in New Issue
Block a user