text_detection.cpp 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. /*
  2. Text detection model: https://github.com/argman/EAST
  3. Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
  4. Text recognition models can be downloaded directly here:
  5. Download link: https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing
  6. and doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
  7. How to convert from pth to onnx:
  8. Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
  9. import torch
  10. from models.crnn import CRNN
  11. model = CRNN(32, 1, 37, 256)
  12. model.load_state_dict(torch.load('crnn.pth'))
  13. dummy_input = torch.randn(1, 1, 32, 100)
  14. torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
  15. For more information, please refer to doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown and doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
  16. */
  17. #include <iostream>
  18. #include <fstream>
  19. #include <opencv2/imgproc.hpp>
  20. #include <opencv2/highgui.hpp>
  21. #include <opencv2/dnn.hpp>
  22. using namespace cv;
  23. using namespace cv::dnn;
  // Command-line option table handed to cv::CommandLineParser in main().
  // Entries are written as "{ name alias | default value | help text }".
  const char* keys =
      "{ help h | | Print help message. }"
      "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
      "{ detModel dmp | | Path to a binary .pb file contains trained detector network.}"
      "{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }"
      "{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }"
      "{ thr | 0.5 | Confidence threshold. }"
      "{ nms | 0.4 | Non-maximum suppression threshold. }"
      "{ recModel rmp | | Path to a binary .onnx file contains trained CRNN text recognition model. "
      "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
      "{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
      // This file is read line by line in main() and passed to the recognizer as
      // its vocabulary, so the help text describes it as such.
      "{ vocabularyPath vp | alphabet_36.txt | Path to the recognition vocabulary file (one entry per line). "
      "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
  37. void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
  38. int main(int argc, char** argv)
  39. {
  40. // Parse command line arguments.
  41. CommandLineParser parser(argc, argv, keys);
  42. parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
  43. "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
  44. if (argc == 1 || parser.has("help"))
  45. {
  46. parser.printMessage();
  47. return 0;
  48. }
  49. float confThreshold = parser.get<float>("thr");
  50. float nmsThreshold = parser.get<float>("nms");
  51. int width = parser.get<int>("width");
  52. int height = parser.get<int>("height");
  53. int imreadRGB = parser.get<int>("RGBInput");
  54. String detModelPath = parser.get<String>("detModel");
  55. String recModelPath = parser.get<String>("recModel");
  56. String vocPath = parser.get<String>("vocabularyPath");
  57. if (!parser.check())
  58. {
  59. parser.printErrors();
  60. return 1;
  61. }
  62. // Load networks.
  63. CV_Assert(!detModelPath.empty() && !recModelPath.empty());
  64. TextDetectionModel_EAST detector(detModelPath);
  65. detector.setConfidenceThreshold(confThreshold)
  66. .setNMSThreshold(nmsThreshold);
  67. TextRecognitionModel recognizer(recModelPath);
  68. // Load vocabulary
  69. CV_Assert(!vocPath.empty());
  70. std::ifstream vocFile;
  71. vocFile.open(samples::findFile(vocPath));
  72. CV_Assert(vocFile.is_open());
  73. String vocLine;
  74. std::vector<String> vocabulary;
  75. while (std::getline(vocFile, vocLine)) {
  76. vocabulary.push_back(vocLine);
  77. }
  78. recognizer.setVocabulary(vocabulary);
  79. recognizer.setDecodeType("CTC-greedy");
  80. // Parameters for Recognition
  81. double recScale = 1.0 / 127.5;
  82. Scalar recMean = Scalar(127.5, 127.5, 127.5);
  83. Size recInputSize = Size(100, 32);
  84. recognizer.setInputParams(recScale, recInputSize, recMean);
  85. // Parameters for Detection
  86. double detScale = 1.0;
  87. Size detInputSize = Size(width, height);
  88. Scalar detMean = Scalar(123.68, 116.78, 103.94);
  89. bool swapRB = true;
  90. detector.setInputParams(detScale, detInputSize, detMean, swapRB);
  91. // Open a video file or an image file or a camera stream.
  92. VideoCapture cap;
  93. bool openSuccess = parser.has("input") ? cap.open(parser.get<String>("input")) : cap.open(0);
  94. CV_Assert(openSuccess);
  95. static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
  96. Mat frame;
  97. while (waitKey(1) < 0)
  98. {
  99. cap >> frame;
  100. if (frame.empty())
  101. {
  102. waitKey();
  103. break;
  104. }
  105. std::cout << frame.size << std::endl;
  106. // Detection
  107. std::vector< std::vector<Point> > detResults;
  108. detector.detect(frame, detResults);
  109. Mat frame2 = frame.clone();
  110. if (detResults.size() > 0) {
  111. // Text Recognition
  112. Mat recInput;
  113. if (!imreadRGB) {
  114. cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
  115. } else {
  116. recInput = frame;
  117. }
  118. std::vector< std::vector<Point> > contours;
  119. for (uint i = 0; i < detResults.size(); i++)
  120. {
  121. const auto& quadrangle = detResults[i];
  122. CV_CheckEQ(quadrangle.size(), (size_t)4, "");
  123. contours.emplace_back(quadrangle);
  124. std::vector<Point2f> quadrangle_2f;
  125. for (int j = 0; j < 4; j++)
  126. quadrangle_2f.emplace_back(quadrangle[j]);
  127. Mat cropped;
  128. fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
  129. std::string recognitionResult = recognizer.recognize(cropped);
  130. std::cout << i << ": '" << recognitionResult << "'" << std::endl;
  131. putText(frame2, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255), 2);
  132. }
  133. polylines(frame2, contours, true, Scalar(0, 255, 0), 2);
  134. }
  135. imshow(kWinName, frame2);
  136. }
  137. return 0;
  138. }
  139. void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
  140. {
  141. const Size outputSize = Size(100, 32);
  142. Point2f targetVertices[4] = {
  143. Point(0, outputSize.height - 1),
  144. Point(0, 0), Point(outputSize.width - 1, 0),
  145. Point(outputSize.width - 1, outputSize.height - 1)
  146. };
  147. Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
  148. warpPerspective(frame, result, rotationMatrix, outputSize);
  149. }