object_detection.cpp

#include <fstream>
#include <sstream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#if defined(CV_CXX11) && defined(HAVE_THREADS)
#define USE_THREADS 1
#endif

#ifdef USE_THREADS
#include <mutex>
#include <thread>
#include <queue>
#endif

#include "common.hpp"
std::string keys =
    "{ help  h     | | Print help message. }"
    "{ @alias      | | An alias name of model to extract preprocessing parameters from models.yml file. }"
    "{ zoo         | models.yml | An optional path to file with preprocessing parameters }"
    "{ device      | 0 | Camera device number. }"
    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it is not set. }"
    "{ classes     | | Optional path to a text file with names of classes to label detected objects. }"
    "{ thr         | .5 | Confidence threshold. }"
    "{ nms         | .4 | Non-maximum suppression threshold. }"
    "{ backend     | 0 | Choose one of computation backends: "
                        "0: automatically (by default), "
                        "1: Halide language (http://halide-lang.org/), "
                        "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                        "3: OpenCV implementation, "
                        "4: VKCOM, "
                        "5: CUDA }"
    "{ target      | 0 | Choose one of target computation devices: "
                        "0: CPU target (by default), "
                        "1: OpenCL, "
                        "2: OpenCL fp16 (half-float precision), "
                        "3: VPU, "
                        "4: Vulkan, "
                        "6: CUDA, "
                        "7: CUDA fp16 (half-float precision) }"
    "{ async       | 0 | Number of asynchronous forwards at the same time. "
                        "Choose 0 for synchronous mode }";
using namespace cv;
using namespace dnn;

float confThreshold, nmsThreshold;
std::vector<std::string> classes;

inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
                       const Scalar& mean, bool swapRB);

void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net, int backend);

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

void callback(int pos, void* userdata);
#ifdef USE_THREADS
// A std::queue wrapper that serializes push/get/clear with a mutex and
// measures the rate (FPS) at which entries are pushed.
template <typename T>
class QueueFPS : public std::queue<T>
{
public:
    QueueFPS() : counter(0) {}

    void push(const T& entry)
    {
        std::lock_guard<std::mutex> lock(mutex);

        std::queue<T>::push(entry);
        counter += 1;
        if (counter == 1)
        {
            // Start counting from a second frame (warmup).
            tm.reset();
            tm.start();
        }
    }

    T get()
    {
        std::lock_guard<std::mutex> lock(mutex);
        T entry = this->front();
        this->pop();
        return entry;
    }

    float getFPS()
    {
        tm.stop();
        double fps = counter / tm.getTimeSec();
        tm.start();
        return static_cast<float>(fps);
    }

    void clear()
    {
        std::lock_guard<std::mutex> lock(mutex);
        while (!this->empty())
            this->pop();
    }

    unsigned int counter;

private:
    TickMeter tm;
    std::mutex mutex;
};
#endif  // USE_THREADS
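
// Each QueueFPS<T> below is shared between one producer and one consumer
// thread: the capture thread push()es frames that the processing thread
// get()s, and so on down the pipeline. Note that empty() and size() are
// inherited from std::queue and are not mutex-guarded, so a reader may see a
// momentarily stale answer; the polling loops in main() tolerate this by
// simply re-checking on the next iteration.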
int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);

    const std::string modelName = parser.get<String>("@alias");
    const std::string zooFile = parser.get<String>("zoo");

    keys += genPreprocArguments(modelName, zooFile);

    parser = CommandLineParser(argc, argv, keys);
    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    confThreshold = parser.get<float>("thr");
    nmsThreshold = parser.get<float>("nms");
    float scale = parser.get<float>("scale");
    Scalar mean = parser.get<Scalar>("mean");
    bool swapRB = parser.get<bool>("rgb");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    size_t asyncNumReq = parser.get<int>("async");

    CV_Assert(parser.has("model"));
    std::string modelPath = findFile(parser.get<String>("model"));
    std::string configPath = findFile(parser.get<String>("config"));

    // Open a file with class names.
    if (parser.has("classes"))
    {
        std::string file = parser.get<String>("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))
        {
            classes.push_back(line);
        }
    }

    // Load a model.
    Net net = readNet(modelPath, configPath, parser.get<String>("framework"));
    int backend = parser.get<int>("backend");
    net.setPreferableBackend(backend);
    net.setPreferableTarget(parser.get<int>("target"));
    std::vector<String> outNames = net.getUnconnectedOutLayersNames();

    // Create a window.
    static const std::string kWinName = "Deep learning object detection in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);
    int initialConf = (int)(confThreshold * 100);
    createTrackbar("Confidence threshold, %", kWinName, &initialConf, 99, callback);

    // Open a video file, an image file, or a camera stream.
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(parser.get<int>("device"));
#ifdef USE_THREADS
    bool process = true;

    // Frames capturing thread.
    QueueFPS<Mat> framesQueue;
    std::thread framesThread([&](){
        Mat frame;
        while (process)
        {
            cap >> frame;
            if (!frame.empty())
                framesQueue.push(frame.clone());
            else
                break;
        }
    });

    // Frames processing thread.
    QueueFPS<Mat> processedFramesQueue;
    QueueFPS<std::vector<Mat> > predictionsQueue;
    std::thread processingThread([&](){
        std::queue<AsyncArray> futureOutputs;
        Mat blob;
        while (process)
        {
            // Get the next frame.
            Mat frame;
            {
                if (!framesQueue.empty())
                {
                    frame = framesQueue.get();
                    if (asyncNumReq)
                    {
                        if (futureOutputs.size() == asyncNumReq)
                            frame = Mat();
                    }
                    else
                        framesQueue.clear();  // Skip the rest of the frames.
                }
            }

            // Process the frame.
            if (!frame.empty())
            {
                preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);
                processedFramesQueue.push(frame);

                if (asyncNumReq)
                {
                    futureOutputs.push(net.forwardAsync());
                }
                else
                {
                    std::vector<Mat> outs;
                    net.forward(outs, outNames);
                    predictionsQueue.push(outs);
                }
            }

            // Collect any asynchronous results that are already ready.
            while (!futureOutputs.empty() &&
                   futureOutputs.front().wait_for(std::chrono::seconds(0)))
            {
                AsyncArray async_out = futureOutputs.front();
                futureOutputs.pop();
                Mat out;
                async_out.get(out);
                predictionsQueue.push({out});
            }
        }
    });

    // Postprocessing and rendering loop.
    while (waitKey(1) < 0)
    {
        if (predictionsQueue.empty())
            continue;

        std::vector<Mat> outs = predictionsQueue.get();
        Mat frame = processedFramesQueue.get();

        postprocess(frame, outs, net, backend);

        if (predictionsQueue.counter > 1)
        {
            std::string label = format("Camera: %.2f FPS", framesQueue.getFPS());
            putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

            label = format("Network: %.2f FPS", predictionsQueue.getFPS());
            putText(frame, label, Point(0, 30), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

            label = format("Skipped frames: %d", framesQueue.counter - predictionsQueue.counter);
            putText(frame, label, Point(0, 45), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
        }
        imshow(kWinName, frame);
    }

    process = false;
    framesThread.join();
    processingThread.join();
#else  // USE_THREADS
    if (asyncNumReq)
        CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with Inference Engine backend.");

    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }

        preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);

        std::vector<Mat> outs;
        net.forward(outs, outNames);

        postprocess(frame, outs, net, backend);

        // Put efficiency information.
        std::vector<double> layersTimes;
        double freq = getTickFrequency() / 1000;
        double t = net.getPerfProfile(layersTimes) / freq;
        std::string label = format("Inference time: %.2f ms", t);
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
    }
#endif  // USE_THREADS
    return 0;
}
inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
                       const Scalar& mean, bool swapRB)
{
    static Mat blob;
    // Create a 4D blob from a frame.
    if (inpSize.width <= 0) inpSize.width = frame.cols;
    if (inpSize.height <= 0) inpSize.height = frame.rows;
    blobFromImage(frame, blob, 1.0, inpSize, Scalar(), swapRB, false, CV_8U);

    // Run a model.
    net.setInput(blob, "", scale, mean);
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        resize(frame, frame, inpSize);
        Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
        net.setInput(imInfo, "im_info");
    }
}
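
// Note on the normalization above: the blob is created as CV_8U with a unit
// scale and zero mean, and setInput(blob, "", scale, mean) then registers the
// preprocessing input = scale * (blob - mean) as part of the network, so the
// conversion to floating point is left to the backend rather than done here.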
void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net, int backend)
{
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;

    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<Rect> boxes;
    if (outLayerType == "DetectionOutput")
    {
        // Network produces an output blob with a shape 1x1xNx7 where N is the
        // number of detections and every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom].
        CV_Assert(outs.size() > 0);
        for (size_t k = 0; k < outs.size(); k++)
        {
            float* data = (float*)outs[k].data;
            for (size_t i = 0; i < outs[k].total(); i += 7)
            {
                float confidence = data[i + 2];
                if (confidence > confThreshold)
                {
                    int left   = (int)data[i + 3];
                    int top    = (int)data[i + 4];
                    int right  = (int)data[i + 5];
                    int bottom = (int)data[i + 6];
                    int width  = right - left + 1;
                    int height = bottom - top + 1;
                    if (width <= 2 || height <= 2)
                    {
                        // A degenerate pixel box usually means the model reports
                        // coordinates normalized to [0, 1]; rescale to the frame size.
                        left   = (int)(data[i + 3] * frame.cols);
                        top    = (int)(data[i + 4] * frame.rows);
                        right  = (int)(data[i + 5] * frame.cols);
                        bottom = (int)(data[i + 6] * frame.rows);
                        width  = right - left + 1;
                        height = bottom - top + 1;
                    }
                    classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
                    boxes.push_back(Rect(left, top, width, height));
                    confidences.push_back(confidence);
                }
            }
        }
    }
    else if (outLayerType == "Region")
    {
        for (size_t i = 0; i < outs.size(); ++i)
        {
            // Network produces an output blob with a shape NxC where N is the
            // number of detected objects and C is the number of classes plus 5:
            // [center_x, center_y, width, height, objectness, class scores...].
            float* data = (float*)outs[i].data;
            for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
            {
                Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
                Point classIdPoint;
                double confidence;
                minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
                if (confidence > confThreshold)
                {
                    int centerX = (int)(data[0] * frame.cols);
                    int centerY = (int)(data[1] * frame.rows);
                    int width   = (int)(data[2] * frame.cols);
                    int height  = (int)(data[3] * frame.rows);
                    int left = centerX - width / 2;
                    int top  = centerY - height / 2;

                    classIds.push_back(classIdPoint.x);
                    confidences.push_back((float)confidence);
                    boxes.push_back(Rect(left, top, width, height));
                }
            }
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
    // NMS is performed inside the Region layer only for DNN_BACKEND_OPENCV, so
    // for other backends we run NMS in the sample. NMS is also required when
    // the network has more than one output layer.
    if (outLayers.size() > 1 || (outLayerType == "Region" && backend != DNN_BACKEND_OPENCV))
    {
        std::map<int, std::vector<size_t> > class2indices;
        for (size_t i = 0; i < classIds.size(); i++)
        {
            if (confidences[i] >= confThreshold)
            {
                class2indices[classIds[i]].push_back(i);
            }
        }

        std::vector<Rect> nmsBoxes;
        std::vector<float> nmsConfidences;
        std::vector<int> nmsClassIds;
        for (std::map<int, std::vector<size_t> >::iterator it = class2indices.begin(); it != class2indices.end(); ++it)
        {
            std::vector<Rect> localBoxes;
            std::vector<float> localConfidences;
            std::vector<size_t> classIndices = it->second;
            for (size_t i = 0; i < classIndices.size(); i++)
            {
                localBoxes.push_back(boxes[classIndices[i]]);
                localConfidences.push_back(confidences[classIndices[i]]);
            }
            std::vector<int> nmsIndices;
            NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, nmsIndices);
            for (size_t i = 0; i < nmsIndices.size(); i++)
            {
                size_t idx = nmsIndices[i];
                nmsBoxes.push_back(localBoxes[idx]);
                nmsConfidences.push_back(localConfidences[idx]);
                nmsClassIds.push_back(it->first);
            }
        }
        boxes = nmsBoxes;
        classIds = nmsClassIds;
        confidences = nmsConfidences;
    }
    for (size_t idx = 0; idx < boxes.size(); ++idx)
    {
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
                 box.x + box.width, box.y + box.height, frame);
    }
}
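
// Worked example of the per-class NMS above: with the default nmsThreshold of
// 0.4, two "person" boxes with IoU 0.6 and confidences 0.9 and 0.7 leave only
// the 0.9 box (the 0.7 one is suppressed), while an overlapping "bicycle" box
// survives because boxes are grouped by class id before NMSBoxes is called.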
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
{
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));

    std::string label = format("%.2f", conf);
    if (!classes.empty())
    {
        CV_Assert(classId < (int)classes.size());
        label = classes[classId] + ": " + label;
    }

    // Draw the label on a filled background so it stays readable.
    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height),
              Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
}
void callback(int pos, void*)
{
    // Trackbar callback: converts the percent slider into the global threshold.
    confThreshold = pos * 0.01f;
}