运行 tesseract-ocr-w64-setup-v5.3.0.20221214.exe,选择安装目录,之后一路点击"下一步",按默认选项完成安装。
将下载的训练库文件(例如 chi_sim.traineddata)放在安装目录下的 tessdata 文件夹中即可,例如 D:\Program Files\Tesseract-OCR\tessdata
训练库地址:https://github.com/tesseract-ocr/tessdata
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<!-- Tess4J: Java wrapper around Tesseract OCR; supplies the Tesseract, ITesseract and TesseractException types used by the Java code in this file. -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.bytedeco/javacv-platform -->
<!-- JavaCV: supplies FFmpegFrameGrabber, Frame and Java2DFrameUtils, which the Java code in this file uses to decode video frames. -->
<dependency>
<groupId>org.bytedeco</groupId>
<artifactId>javacv-platform</artifactId>
<version>1.5.7</version>
</dependency>
/**
 * Extracts text from a video file by running Tesseract OCR on each decoded frame.
 *
 * <p>Every frame is converted to grayscale, recognized with the Simplified Chinese
 * model, and the distinct recognition results (in first-seen order) are written to
 * {@link #resultFile}.
 */
public class VideoTextExtractor {
    /** Path of the Tesseract-OCR {@code tessdata} folder. */
    public static final String pathToTessdataFolder = "D:\\Program Files\\Tesseract-OCR\\tessdata\\";
    /** Video file to read frames from. */
    public static final String pathToVideoFile = "C:\\Users\\lixiewen\\Documents\\oCam\\录制_2023_05_31_09_39_51_172.mp4";
    /** File the recognized text is written to. */
    public static final String resultFile = "E:\\tmp\\tmp.txt";

    /**
     * Program entry point; runs the extraction with the paths configured above.
     *
     * @param args ignored
     * @throws TesseractException declared for compatibility; OCR failures are handled per frame
     */
    public static void main(String[] args) throws TesseractException {
        extracted();
    }

    /**
     * Decodes the video frame by frame, OCRs each frame and writes the de-duplicated
     * results to {@link #resultFile}. Failures are printed, not propagated.
     */
    private static void extracted() {
        File tessDataFolder = new File(pathToTessdataFolder);
        // NOTE(review): kept from the original. The engine below is configured through
        // setDatapath; confirm whether anything actually reads this system property.
        System.setProperty("TESSDATA_PREFIX", tessDataFolder.getAbsolutePath());

        // LinkedHashSet: drops identical OCR results while keeping first-seen order.
        Set<String> recognizedTexts = new LinkedHashSet<>();

        // try-with-resources releases the grabber even when start() or the loop fails.
        try (FFmpegFrameGrabber grabber = new FFmpegFrameGrabber(pathToVideoFile)) {
            grabber.start();

            // The engine configuration is the same for every frame, so build it once.
            Tesseract tesseract = getTesseract(tessDataFolder);

            int lengthInFrames = grabber.getLengthInFrames();
            for (int i = 0; i < lengthInFrames; i++) {
                System.out.println("进度 " + i + " / " + lengthInFrames);
                try {
                    Frame frame = grabber.grabImage();
                    if (frame == null) {
                        // No more image frames; the reported frame count can exceed
                        // what the stream actually contains.
                        break;
                    }
                    BufferedImage grayImage = toGrayscale(Java2DFrameUtils.toBufferedImage(frame));
                    // OCR the in-memory image directly; no temporary file is needed.
                    recognizedTexts.add(tesseract.doOCR(grayImage));
                } catch (Exception e) {
                    // A single bad frame must not abort the whole run.
                    e.printStackTrace();
                }
            }

            FileUtils.write2File(new File(resultFile), new ArrayList<>(recognizedTexts));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a grayscale copy of the given image.
     *
     * @param source image to convert
     * @return a new {@code TYPE_BYTE_GRAY} image with the same dimensions
     */
    private static BufferedImage toGrayscale(BufferedImage source) {
        BufferedImage grayImage =
                new BufferedImage(source.getWidth(), source.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
        Graphics2D graphics = grayImage.createGraphics();
        try {
            graphics.drawImage(source, 0, 0, null);
        } finally {
            graphics.dispose();
        }
        return grayImage;
    }

    /**
     * Creates a Tesseract engine for Simplified Chinese text.
     *
     * @param tessDataFolder folder containing the trained data files
     * @return the configured engine
     */
    private static Tesseract getTesseract(File tessDataFolder) {
        Tesseract tesseract = new Tesseract();
        // Simplified Chinese trained data. Author's note: accuracy for Chinese is
        // still lower than Baidu OCR.
        tesseract.setLanguage("chi_sim");
        tesseract.setDatapath(tessDataFolder.getAbsolutePath());
        return tesseract;
    }
}
/**
 * Builds a Tesseract OCR engine for Simplified Chinese that loads its trained
 * data from the project's {@code src/main/resources/traineddata} folder.
 *
 * @return the configured engine
 * @throws Exception declared in the signature; no statement in this body throws a checked exception
 */
public static ITesseract getTesseract() throws Exception {
    ITesseract ocr = new Tesseract();
    // Recognize Simplified Chinese.
    ocr.setLanguage("chi_sim");
    // Trained data folder, given as a relative path.
    ocr.setDatapath("src/main/resources/traineddata");
    return ocr;
}