有没有可能用PdfSharp从PDF文件中提取纯文本?我不想使用iTextSharp,因为它有许可证。
发布于 2014-06-05 03:37:48
采用了Sergio的答案,并做了一些扩展方法。我还将字符串的累加过程更改为迭代器。
public static class PdfSharpExtensions
{
public static IEnumerable<string> ExtractText(this PdfPage page)
{
var content = ContentReader.ReadContent(page);
var text = content.ExtractText();
return text;
}
public static IEnumerable<string> ExtractText(this CObject cObject)
{
if (cObject is COperator)
{
var cOperator = cObject as COperator;
if (cOperator.OpCode.Name== OpCodeName.Tj.ToString() ||
cOperator.OpCode.Name == OpCodeName.TJ.ToString())
{
foreach (var cOperand in cOperator.Operands)
foreach (var txt in ExtractText(cOperand))
yield return txt;
}
}
else if (cObject is CSequence)
{
var cSequence = cObject as CSequence;
foreach (var element in cSequence)
foreach (var txt in ExtractText(element))
yield return txt;
}
else if (cObject is CString)
{
var cString = cObject as CString;
yield return cString.Value;
}
}
}
发布于 2014-05-15 09:02:08
我以某种方式实现了它,类似于David是如何做到的。下面是我的代码:
{
// ....
var page = document.Pages[1];
CObject content = ContentReader.ReadContent(page);
var extractedText = ExtractText(content);
// ...
}
private IEnumerable<string> ExtractText(CObject cObject )
{
var textList = new List<string>();
if (cObject is COperator)
{
var cOperator = cObject as COperator;
if (cOperator.OpCode.Name== OpCodeName.Tj.ToString() ||
cOperator.OpCode.Name == OpCodeName.TJ.ToString())
{
foreach (var cOperand in cOperator.Operands)
{
textList.AddRange(ExtractText(cOperand));
}
}
}
else if (cObject is CSequence)
{
var cSequence = cObject as CSequence;
foreach (var element in cSequence)
{
textList.AddRange(ExtractText(element));
}
}
else if (cObject is CString)
{
var cString = cObject as CString;
textList.Add(cString.Value);
}
return textList;
}
发布于 2013-08-01 16:36:29
PDFSharp提供了从PDF中提取文本的所有工具。使用ContentReader
类访问每个页面中的命令,并从TJ/Tj运算符提取字符串。
我已经将一个简单的实现上传到了github。
https://stackoverflow.com/questions/10141143
复制相似问题