audio_spectrogram.cpp 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071
  1. #include <opencv2/core.hpp>
  2. #include <opencv2/videoio.hpp>
  3. #include <opencv2/highgui.hpp>
  4. #include <opencv2/imgproc.hpp>
  5. #include <iostream>
  6. #include <vector>
  7. #include <string>
  8. #include <cmath>
  9. using namespace cv;
  10. using namespace std;
  11. class AudioDrawing
  12. {
  13. public:
  14. AudioDrawing(const CommandLineParser& parser) {
  15. if (!initAndCheckArgs(parser))
  16. {
  17. cerr << "Error: Wrong input arguments" << endl;
  18. exit(0);
  19. }
  20. Draw();
  21. }
  22. void Draw() {
  23. if (draw == "static")
  24. {
  25. vector<int>inputAudio = {};
  26. int samplingRate = 0;
  27. if (inputType == "file")
  28. {
  29. samplingRate = readAudioFile(audio, inputAudio);
  30. }
  31. else if (inputType == "microphone")
  32. {
  33. samplingRate = readAudioMicrophone(inputAudio);
  34. }
  35. if ((inputAudio.size() == 0) || samplingRate <= 0)
  36. {
  37. cerr << "Error: problems with audio reading, check input arguments" << endl;
  38. return;
  39. }
  40. int duration = static_cast<int>(inputAudio.size()) / samplingRate;
  41. // since the dimensional grid is counted in integer seconds,
  42. // if the input audio has an incomplete last second,
  43. // then it is filled with zeros to complete
  44. int remainder = static_cast<int>(inputAudio.size()) % samplingRate;
  45. if (remainder)
  46. {
  47. int sizeToFullSec = samplingRate - remainder;
  48. for (int j = 0; j < sizeToFullSec; ++j)
  49. {
  50. inputAudio.push_back(0);
  51. }
  52. duration += 1;
  53. cout << "Update duration of audio to full last second with " <<
  54. sizeToFullSec << " zero samples" << endl;
  55. cout << "New number of samples " << inputAudio.size() << endl;
  56. }
  57. cout << "Duration of audio = " << duration << " seconds" << endl;
  58. // since the dimensional grid is counted in integer seconds,
  59. // if duration of file is less than xmarkup, to avoid an incorrect display,
  60. // xmarkup will be taken equal to duration
  61. if (duration <= xmarkup)
  62. {
  63. xmarkup = duration + 1;
  64. }
  65. if (graph == "ampl")
  66. {
  67. Mat imgAmplitude = drawAmplitude(inputAudio);
  68. imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
  69. imshow("Display amplitude graph", imgAmplitude);
  70. waitKey(0);
  71. }
  72. else if (graph == "spec")
  73. {
  74. vector<vector<double>>stft = STFT(inputAudio);
  75. Mat imgSpec = drawSpectrogram(stft);
  76. imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
  77. imshow("Display spectrogram", imgSpec);
  78. waitKey(0);
  79. }
  80. else if (graph == "ampl_and_spec")
  81. {
  82. Mat imgAmplitude = drawAmplitude(inputAudio);
  83. imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
  84. vector<vector<double>>stft = STFT(inputAudio);
  85. Mat imgSpec = drawSpectrogram(stft);
  86. imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
  87. Mat imgTotal = concatenateImages(imgAmplitude, imgSpec);
  88. imshow("Display amplitude graph and spectrogram", imgTotal);
  89. waitKey(0);
  90. }
  91. }
  92. else if (draw == "dynamic")
  93. {
  94. if (inputType == "file")
  95. {
  96. dynamicFile(audio);
  97. }
  98. else if (inputType == "microphone")
  99. {
  100. dynamicMicrophone();
  101. }
  102. }
  103. }
  104. ~AudioDrawing() {
  105. }
  106. int readAudioFile(string file, vector<int>& inputAudio)
  107. {
  108. VideoCapture cap;
  109. vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream,
  110. CAP_PROP_VIDEO_STREAM, -1,
  111. CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
  112. cap.open(file, CAP_ANY, params);
  113. if (!cap.isOpened())
  114. {
  115. cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
  116. return -1;
  117. }
  118. const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
  119. const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
  120. cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
  121. int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
  122. cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
  123. cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
  124. cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
  125. vector<int> frameVec;
  126. Mat frame;
  127. for (;;)
  128. {
  129. if (cap.grab())
  130. {
  131. cap.retrieve(frame, audioBaseIndex);
  132. frameVec = frame;
  133. inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
  134. }
  135. else
  136. {
  137. cout << "Number of samples: " << inputAudio.size() << endl;
  138. break;
  139. }
  140. }
  141. return samplingRate;
  142. }
  143. int readAudioMicrophone(vector<int>& inputAudio)
  144. {
  145. VideoCapture cap;
  146. vector<int> params { CAP_PROP_AUDIO_STREAM, 0,
  147. CAP_PROP_VIDEO_STREAM, -1 };
  148. cap.open(0, CAP_ANY, params);
  149. if (!cap.isOpened())
  150. {
  151. cerr << "Error: Can't open microphone" << endl;
  152. return -1;
  153. }
  154. const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
  155. const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
  156. cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
  157. int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
  158. cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << samplingRate << endl;
  159. cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
  160. cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
  161. const double cvTickFreq = getTickFrequency();
  162. int64 sysTimeCurr = getTickCount();
  163. int64 sysTimePrev = sysTimeCurr;
  164. vector<int> frameVec;
  165. Mat frame;
  166. while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
  167. {
  168. if (cap.grab())
  169. {
  170. cap.retrieve(frame, audioBaseIndex);
  171. frameVec = frame;
  172. inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
  173. sysTimeCurr = getTickCount();
  174. }
  175. else
  176. {
  177. cerr << "Error: Grab error" << endl;
  178. break;
  179. }
  180. }
  181. cout << "Number of samples: " << inputAudio.size() << endl;
  182. return samplingRate;
  183. }
  184. Mat drawAmplitude(vector<int>& inputAudio)
  185. {
  186. Scalar color = Scalar(247,111,87);
  187. int thickness = 5;
  188. int frameVectorRows = 500;
  189. int middle = frameVectorRows / 2;
  190. // usually the input data is too big, so it is necessary
  191. // to reduce size using interpolation of data
  192. int frameVectorCols = 40000;
  193. if (static_cast<int>(inputAudio.size()) < frameVectorCols)
  194. {
  195. frameVectorCols = static_cast<int>(inputAudio.size());
  196. }
  197. Mat img(frameVectorRows, frameVectorCols, CV_8UC3 , Scalar(255,255,255)); // white background
  198. vector<double>reshapeAudio(inputAudio.size());
  199. for (size_t i = 0; i < inputAudio.size(); ++i)
  200. {
  201. reshapeAudio[i]=static_cast<double>(inputAudio[i]);
  202. }
  203. Mat img_frameVector( 1, static_cast<int>(reshapeAudio.size()), CV_64F , reshapeAudio.data());
  204. Mat img_frameVector_resize;
  205. resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), INTER_LINEAR);
  206. reshapeAudio = img_frameVector_resize;
  207. // normalization data by maximum element
  208. normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF);
  209. for (size_t i = 0; i < reshapeAudio.size(); ++i)
  210. {
  211. reshapeAudio[i] = middle - reshapeAudio[i] * middle;
  212. }
  213. for (int i = 1; i < static_cast<int>(reshapeAudio.size()); ++i)
  214. {
  215. line(img, Point(i-1, static_cast<int>(reshapeAudio[i-1])), Point(i, static_cast<int>(reshapeAudio[i])), color, thickness);
  216. }
  217. Mat resImage;
  218. resize(img, resImage, Size(900, 400), INTER_AREA );
  219. return resImage;
  220. }
  221. Mat drawAmplitudeScale(Mat& inputImg, const vector<int>& inputAudio, int samplingRate,
  222. int xmin = 0, int xmax = 0)
  223. {
  224. // function of layout drawing for graph of volume amplitudes
  225. // x axis for time
  226. // y axis for amplitudes
  227. // parameters for the new image size
  228. int preCol = 100;
  229. int aftCol = 100;
  230. int preLine = 40;
  231. int aftLine = 50;
  232. int frameVectorRows = inputImg.rows;
  233. int frameVectorCols = inputImg.cols;
  234. int totalRows = preLine + frameVectorRows + aftLine;
  235. int totalCols = preCol + frameVectorCols + aftCol;
  236. Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255));
  237. inputImg.copyTo(imgTotal(Rect(preCol, preLine, inputImg.cols, inputImg.rows)));
  238. // calculating values on x axis
  239. if (xmax == 0)
  240. {
  241. xmax = static_cast<int>(inputAudio.size()) / samplingRate;
  242. }
  243. std::vector<double> xList(xmarkup);
  244. if (xmax >= xmarkup)
  245. {
  246. double deltax = (xmax - xmin) / (xmarkup - 1);
  247. for (int i = 0; i < xmarkup; ++i)
  248. {
  249. xList[i] = (xmin + deltax * i);
  250. }
  251. }
  252. else
  253. {
  254. // this case is used to display a dynamic update
  255. vector<double> tmpXList;
  256. for (int i = xmin; i < xmax; ++i)
  257. {
  258. tmpXList.push_back(i + 1);
  259. }
  260. int k = 0;
  261. for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i)
  262. {
  263. xList[i] = tmpXList[k];
  264. k += 1;
  265. }
  266. }
  267. // calculating values on y axis
  268. double minCv; double maxCv; Point minLoc; Point maxLoc;
  269. minMaxLoc(inputAudio, &minCv, &maxCv, &minLoc, &maxLoc);
  270. int ymin = static_cast<int>(minCv);
  271. int ymax = static_cast<int>(maxCv);
  272. std::vector<double> yList(ymarkup);
  273. double deltay = (ymax - ymin) / (ymarkup - 1);
  274. for (int i = 0; i < ymarkup; ++i)
  275. {
  276. yList[i] = ymin + deltay * i;
  277. }
  278. // parameters for layout drawing
  279. int textThickness = 1;
  280. int gridThickness = 1;
  281. Scalar gridColor(0, 0, 0);
  282. Scalar textColor(0, 0, 0);
  283. float fontScale = 0.5;
  284. // horizontal axis
  285. line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
  286. gridColor, gridThickness);
  287. // vertical axis
  288. line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
  289. gridColor, gridThickness);
  290. // parameters for layout calculation
  291. int serifSize = 10;
  292. int indentDownX = serifSize * 2;
  293. int indentDownY = serifSize / 2;
  294. int indentLeftX = serifSize;
  295. int indentLeftY = 2 * preCol / 3;
  296. // drawing layout for x axis
  297. int numX = frameVectorCols / (xmarkup - 1);
  298. for (size_t i = 0; i < xList.size(); ++i)
  299. {
  300. int a1 = static_cast<int>(preCol + i * numX);
  301. int a2 = frameVectorRows + preLine;
  302. int b1 = a1;
  303. int b2 = a2 + serifSize;
  304. if (enableGrid)
  305. {
  306. int d1 = a1;
  307. int d2 = preLine;
  308. line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness);
  309. }
  310. line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
  311. putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
  312. FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
  313. }
  314. // drawing layout for y axis
  315. int numY = frameVectorRows / (ymarkup - 1);
  316. for (size_t i = 0; i < yList.size(); ++i) {
  317. int a1 = preCol;
  318. int a2 = static_cast<int>(totalRows - aftLine - i * numY);
  319. int b1 = preCol - serifSize;
  320. int b2 = a2;
  321. if (enableGrid)
  322. {
  323. int d1 = preCol + frameVectorCols;
  324. int d2 = a2;
  325. line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness);
  326. }
  327. line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
  328. putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
  329. FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
  330. }
  331. Mat resImage;
  332. resize(imgTotal, resImage, Size(cols, rows), INTER_AREA );
  333. return resImage;
  334. }
  335. vector<vector<double>> STFT(const vector<int>& inputAudio)
  336. {
  337. // The Short-time Fourier transform (STFT), is a Fourier-related transform used to
  338. // determine the sinusoidal frequency and phase content of local sections of a signal
  339. // as it changes over time.
  340. // In practice, the procedure for computing STFTs is to divide a longer time signal
  341. // into shorter segments of equal length and then compute the Fourier transform separately
  342. // on each shorter segment. This reveals the Fourier spectrum on each shorter segment.
  343. // One then usually plots the changing spectra as a function of time, known as a spectrogram
  344. // or waterfall plot.
  345. // https://en.wikipedia.org/wiki/Short-time_Fourier_transform
  346. int timeStep = windLen - overlap;
  347. Mat dstMat;
  348. vector<double> stftRow;
  349. vector<double> WindType;
  350. if (windowType == "Hann")
  351. {
  352. // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
  353. for (int j = 1 - windLen; j < windLen; j+=2)
  354. {
  355. WindType.push_back(j * (0.5 * (1 - cos(CV_PI * j / (windLen - 1)))));
  356. }
  357. }
  358. else if (windowType == "Hamming")
  359. {
  360. // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
  361. for (int j = 1 - windLen; j < windLen; j+=2)
  362. {
  363. WindType.push_back(j * (0.53836 - 0.46164 * (cos(CV_PI * j / (windLen - 1)))));
  364. }
  365. }
  366. for (size_t i = 0; i < inputAudio.size(); i += timeStep)
  367. {
  368. vector<double>section(windLen, 0);
  369. for (int j = 0; j < windLen; ++j)
  370. {
  371. section[j] = inputAudio[j + i];
  372. }
  373. if (windowType == "Hann" || windowType == "Hamming")
  374. {
  375. for (size_t j = 0; j < section.size(); ++j)
  376. {
  377. section[j] *= WindType[j];
  378. }
  379. }
  380. dft(section, dstMat, DFT_COMPLEX_OUTPUT);
  381. for (int j = 0; j < dstMat.cols / 4; ++j)
  382. {
  383. double complModule = sqrt(dstMat.at<double>(2*j) * dstMat.at<double>(2*j) +
  384. dstMat.at<double>(2*j+1) * dstMat.at<double>(2*j+1));
  385. stftRow.push_back(complModule);
  386. }
  387. }
  388. size_t xSize = inputAudio.size() / timeStep + 1;
  389. // we need only the first part of the spectrum, the second part is symmetrical
  390. size_t ySize = dstMat.cols / 4;
  391. vector<vector<double>> stft(ySize, vector<double>(xSize, 0.));
  392. for (size_t i = 0; i < xSize; ++i)
  393. {
  394. for (size_t j = 0; j < ySize; ++j)
  395. {
  396. // write elements with transposition and convert it to the decibel scale
  397. double stftElem = stftRow[ i * ySize + j];
  398. if (stftElem != 0.)
  399. {
  400. stft[j][i] = 10 * log10(stftElem);
  401. }
  402. }
  403. }
  404. return stft;
  405. }
  406. Mat drawSpectrogram(const vector<vector<double>>& stft)
  407. {
  408. int frameVectorRows = static_cast<int>(stft.size());
  409. int frameVectorCols = static_cast<int>(stft[0].size());
  410. // Normalization of image values from 0 to 255 to get more contrast image
  411. // and this normalization will be taken into account in the scale drawing
  412. int colormapImageRows = 255;
  413. double minCv; double maxCv; Point minLoc; Point maxLoc;
  414. minMaxLoc(stft[0], &minCv, &maxCv, &minLoc, &maxLoc);
  415. double maxStft = max(abs(maxCv), abs(minCv));
  416. for (int i = 1; i < frameVectorRows; ++i)
  417. {
  418. minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc);
  419. maxStft = max(maxStft, max(abs(maxCv), abs(minCv)));
  420. }
  421. // if maxStft is zero (silence)
  422. if (maxStft == 0.)
  423. {
  424. maxStft = 1;
  425. }
  426. Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255));
  427. for (int i = 0; i < frameVectorRows; ++i)
  428. {
  429. for (int j = 0; j < frameVectorCols; ++j)
  430. {
  431. imgSpec.at<uchar>(frameVectorRows - i - 1, j) = static_cast<uchar>(stft[i][j] * colormapImageRows / maxStft);
  432. }
  433. }
  434. applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO);
  435. Mat resImage;
  436. resize(imgSpec, resImage, Size(900, 400), INTER_AREA);
  437. return resImage;
  438. }
  439. Mat drawSpectrogramColorbar(Mat& inputImg, const vector<int>& inputAudio,
  440. int samplingRate, const vector<vector<double>>& stft,
  441. int xmin = 0, int xmax = 0)
  442. {
  443. // function of layout drawing for the three-dimensional graph of the spectrogram
  444. // x axis for time
  445. // y axis for frequencies
  446. // z axis for magnitudes of frequencies shown by color scale
  447. // parameters for the new image size
  448. int preCol = 100;
  449. int aftCol = 100;
  450. int preLine = 40;
  451. int aftLine = 50;
  452. int colColor = 20;
  453. int indCol = 20;
  454. int frameVectorRows = inputImg.rows;
  455. int frameVectorCols = inputImg.cols;
  456. int totalRows = preLine + frameVectorRows + aftLine;
  457. int totalCols = preCol + frameVectorCols + aftCol;
  458. Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255));
  459. inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows)));
  460. // colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0,
  461. // so here colorbar has values from 255 to 0
  462. int colorArrSize = 256;
  463. Mat imgColorBar = Mat (colorArrSize, colColor, CV_8UC1 , Scalar(255,255,255));
  464. for (int i = 0; i < colorArrSize; ++i)
  465. {
  466. for( int j = 0; j < colColor; ++j)
  467. {
  468. imgColorBar.at<uchar>(i, j) = static_cast<uchar>(colorArrSize - 1 - i); // from 255 to 0
  469. }
  470. }
  471. applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO);
  472. resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), INTER_AREA);
  473. imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows)));
  474. // calculating values on x axis
  475. if (xmax == 0)
  476. {
  477. xmax = static_cast<int>(inputAudio.size()) / samplingRate + 1;
  478. }
  479. vector<double> xList(xmarkup, 0);
  480. if (xmax >= xmarkup)
  481. {
  482. double deltax = (xmax - xmin) / (xmarkup - 1);
  483. for(int i = 0; i < xmarkup; ++i)
  484. {
  485. xList[i] = xmin + deltax * i;
  486. }
  487. }
  488. else
  489. {
  490. // this case is used to display a dynamic update
  491. vector<double> tmpXList;
  492. for(int i = xmin; i < xmax; ++i)
  493. {
  494. tmpXList.push_back(i + 1);
  495. }
  496. int k = 0;
  497. for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i)
  498. {
  499. xList[i] = tmpXList[k];
  500. k += 1;
  501. }
  502. }
  503. // calculating values on y axis
  504. // according to the Nyquist sampling theorem,
  505. // signal should posses frequencies equal to half of sampling rate
  506. int ymin = 0;
  507. int ymax = static_cast<int>(samplingRate / 2);
  508. vector<double> yList;
  509. double deltay = (ymax - ymin) / (ymarkup - 1);
  510. for(int i = 0; i < ymarkup; ++i)
  511. {
  512. yList.push_back(ymin + deltay * i);
  513. }
  514. // calculating values on z axis
  515. double minCv; double maxCv; Point minLoc; Point maxLoc;
  516. minMaxLoc( stft[0], &minCv, &maxCv, &minLoc, &maxLoc);
  517. double zmin = minCv, zmax = maxCv;
  518. std::vector<double> zList;
  519. for (size_t i = 1; i < stft.size(); ++i)
  520. {
  521. minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc);
  522. zmax = max(zmax, maxCv);
  523. zmin = min(zmin, minCv);
  524. }
  525. double deltaz = (zmax - zmin) / (zmarkup - 1);
  526. for(int i = 0; i < zmarkup; ++i)
  527. {
  528. zList.push_back(zmin + deltaz * i);
  529. }
  530. // parameters for layout drawing
  531. int textThickness = 1;
  532. int gridThickness = 1;
  533. Scalar gridColor(0,0,0);
  534. Scalar textColor(0,0,0);
  535. float fontScale = 0.5;
  536. int serifSize = 10;
  537. int indentDownX = serifSize * 2;
  538. int indentDownY = serifSize / 2;
  539. int indentLeftX = serifSize;
  540. int indentLeftY = 2 * preCol / 3;
  541. // horizontal axis
  542. line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
  543. gridColor, gridThickness);
  544. // vertical axis
  545. line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
  546. gridColor, gridThickness);
  547. // drawing layout for x axis
  548. int numX = frameVectorCols / (xmarkup - 1);
  549. for (size_t i = 0; i < xList.size(); ++i)
  550. {
  551. int a1 = static_cast<int>(preCol + i * numX);
  552. int a2 = frameVectorRows + preLine;
  553. int b1 = a1;
  554. int b2 = a2 + serifSize;
  555. line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
  556. putText(imgTotal, to_string(static_cast<int>(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
  557. FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
  558. }
  559. // drawing layout for y axis
  560. int numY = frameVectorRows / (ymarkup - 1);
  561. for (size_t i = 0; i < yList.size(); ++i)
  562. {
  563. int a1 = preCol;
  564. int a2 = static_cast<int>(totalRows - aftLine - i * numY);
  565. int b1 = preCol - serifSize;
  566. int b2 = a2;
  567. line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
  568. putText(imgTotal, to_string(static_cast<int>(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
  569. FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
  570. }
  571. // drawing layout for z axis
  572. int numZ = frameVectorRows / (zmarkup - 1);
  573. for (size_t i = 0; i < zList.size(); ++i)
  574. {
  575. int a1 = preCol + frameVectorCols + indCol + colColor;
  576. int a2 = static_cast<int>(totalRows - aftLine - i * numZ);
  577. int b1 = a1 + serifSize;
  578. int b2 = a2;
  579. line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
  580. putText(imgTotal, to_string(static_cast<int>(zList[i])), Point(b1 + 10, b2 + indentDownY),
  581. FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
  582. }
  583. Mat resImage;
  584. resize(imgTotal, resImage, Size(cols, rows), INTER_AREA );
  585. return resImage;
  586. }
  587. Mat concatenateImages(Mat& img1, Mat& img2)
  588. {
  589. // first image will be under the second image
  590. int totalRows = img1.rows + img2.rows;
  591. int totalCols = max(img1.cols , img2.cols);
  592. // if images columns do not match, the difference is filled in white
  593. Mat imgTotal = Mat (totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255));
  594. img1.copyTo(imgTotal(Rect(0, 0, img1.cols, img1.rows)));
  595. img2.copyTo(imgTotal(Rect(0, img1.rows, img2.cols, img2.rows)));
  596. return imgTotal;
  597. }
  598. void dynamicFile(const string file)
  599. {
  600. VideoCapture cap;
  601. vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream,
  602. CAP_PROP_VIDEO_STREAM, -1,
  603. CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
  604. cap.open(file, CAP_ANY, params);
  605. if (!cap.isOpened())
  606. {
  607. cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
  608. return;
  609. }
  610. const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
  611. const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
  612. int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
  613. cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
  614. cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
  615. cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
  616. cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
  617. int step = static_cast<int>(updateTime * samplingRate);
  618. int frameSize = static_cast<int>(frameSizeTime * samplingRate);
  619. // since the dimensional grid is counted in integer seconds,
  620. // if duration of audio frame is less than xmarkup, to avoid an incorrect display,
  621. // xmarkup will be taken equal to duration
  622. if (frameSizeTime <= xmarkup)
  623. {
  624. xmarkup = frameSizeTime;
  625. }
  626. vector<int> buffer;
  627. vector<int> frameVector;
  628. vector<int> section(frameSize, 0);
  629. vector<vector<double>>stft;
  630. Mat frame, imgAmplitude, imgSpec, imgTotal;
  631. int currentSamples = 0;
  632. int xmin = 0;
  633. int xmax = 0;
  634. for (;;)
  635. {
  636. if (cap.grab())
  637. {
  638. cap.retrieve(frame, audioBaseIndex);
  639. frameVector = frame;
  640. buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
  641. int bufferSize = static_cast<int>(buffer.size());
  642. if (bufferSize >= step)
  643. {
  644. currentSamples += bufferSize;
  645. section.erase(section.begin(), section.begin() + step);
  646. section.insert(section.end(), buffer.begin(), buffer.end());
  647. buffer.erase(buffer.begin(), buffer.begin() + step);
  648. if (currentSamples < frameSize)
  649. {
  650. xmin = 0;
  651. xmax = (currentSamples) / samplingRate;
  652. }
  653. else
  654. {
  655. xmin = (currentSamples - frameSize) / samplingRate + 1;
  656. xmax = (currentSamples) / samplingRate;
  657. }
  658. if (graph == "ampl")
  659. {
  660. imgAmplitude = drawAmplitude(section);
  661. imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
  662. imshow("Display amplitude graph", imgAmplitude);
  663. waitKey(waitTime);
  664. }
  665. else if (graph == "spec")
  666. {
  667. stft = STFT(section);
  668. imgSpec = drawSpectrogram(stft);
  669. imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
  670. imshow("Display spectrogram", imgSpec);
  671. waitKey(waitTime);
  672. }
  673. else if (graph == "ampl_and_spec")
  674. {
  675. imgAmplitude = drawAmplitude(section);
  676. imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
  677. stft = STFT(section);
  678. imgSpec = drawSpectrogram(stft);
  679. imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
  680. imgTotal = concatenateImages(imgAmplitude, imgSpec);
  681. imshow("Display amplitude graph and spectrogram", imgTotal);
  682. waitKey(waitTime);
  683. }
  684. }
  685. }
  686. else
  687. {
  688. break;
  689. }
  690. }
  691. }
  692. void dynamicMicrophone()
  693. {
  694. VideoCapture cap;
  695. vector<int> params { CAP_PROP_AUDIO_STREAM, 0,
  696. CAP_PROP_VIDEO_STREAM, -1 };
  697. cap.open(0, CAP_MSMF, params);
  698. if (!cap.isOpened())
  699. {
  700. cerr << "Error: Can't open microphone" << endl;
  701. return;
  702. }
  703. const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
  704. const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
  705. int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
  706. cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
  707. cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
  708. cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
  709. cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
  710. const double cvTickFreq = getTickFrequency();
  711. int64 sysTimeCurr = getTickCount();
  712. int64 sysTimePrev = sysTimeCurr;
  713. int step = (updateTime * samplingRate);
  714. int frameSize = (frameSizeTime * samplingRate);
  715. // since the dimensional grid is counted in integer seconds,
  716. // if duration of audio frame is less than xmarkup, to avoid an incorrect display,
  717. // xmarkup will be taken equal to duration
  718. if (frameSizeTime <= xmarkup)
  719. {
  720. xmarkup = frameSizeTime;
  721. }
  722. vector<int> frameVector;
  723. vector<int> buffer;
  724. vector<int> section(frameSize, 0);
  725. Mat frame, imgAmplitude, imgSpec, imgTotal;
  726. int currentSamples = 0;
  727. vector<vector<double>> stft;
  728. int xmin = 0;
  729. int xmax = 0;
  730. waitTime = updateTime * 1000;
  731. while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
  732. {
  733. if (cap.grab())
  734. {
  735. cap.retrieve(frame, audioBaseIndex);
  736. frameVector = frame;
  737. buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
  738. sysTimeCurr = getTickCount();
  739. int bufferSize = static_cast<int>(buffer.size());
  740. if (bufferSize >= step)
  741. {
  742. currentSamples += step;
  743. section.erase(section.begin(), section.begin() + step);
  744. section.insert(section.end(), buffer.begin(), buffer.end());
  745. buffer.erase(buffer.begin(), buffer.begin() + step);
  746. if (currentSamples < frameSize)
  747. {
  748. xmin = 0;
  749. xmax = (currentSamples) / samplingRate;
  750. }
  751. else
  752. {
  753. xmin = (currentSamples - frameSize) / samplingRate + 1;
  754. xmax = (currentSamples) / samplingRate;
  755. }
  756. if (graph == "ampl")
  757. {
  758. imgAmplitude = drawAmplitude(section);
  759. imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
  760. imshow("Display amplitude graph", imgAmplitude);
  761. waitKey(waitTime);
  762. }
  763. else if (graph == "spec")
  764. {
  765. stft = STFT(section);
  766. imgSpec = drawSpectrogram(stft);
  767. imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
  768. imshow("Display spectrogram", imgSpec);
  769. waitKey(waitTime);
  770. }
  771. else if (graph == "ampl_and_spec")
  772. {
  773. imgAmplitude = drawAmplitude(section);
  774. imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
  775. stft = STFT(section);
  776. imgSpec = drawSpectrogram(stft);
  777. imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
  778. imgTotal = concatenateImages(imgAmplitude, imgSpec);
  779. imshow("Display amplitude graph and spectrogram", imgTotal);
  780. waitKey(waitTime);
  781. }
  782. }
  783. }
  784. else
  785. {
  786. cerr << "Error: Grab error" << endl;
  787. break;
  788. }
  789. }
  790. }
  791. bool initAndCheckArgs(const CommandLineParser& parser)
  792. {
  793. inputType = parser.get<string>("inputType");
  794. if ((inputType != "file") && (inputType != "microphone"))
  795. {
  796. cout << "Error: " << inputType << " input method doesnt exist" << endl;
  797. return false;
  798. }
  799. draw = parser.get<string>("draw");
  800. if ((draw != "static") && (draw != "dynamic"))
  801. {
  802. cout << "Error: " << draw << " draw type doesnt exist" << endl;
  803. return false;
  804. }
  805. graph = parser.get<string>("graph");
  806. if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec"))
  807. {
  808. cout << "Error: " << graph << " type of graph doesnt exist" << endl;
  809. return false;
  810. }
  811. audio = samples::findFile(parser.get<std::string>("audio"));
  812. audioStream = parser.get<int>("audioStream");
  813. if (audioStream < 0)
  814. {
  815. cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl;
  816. return false;
  817. }
  818. windowType = parser.get<string>("windowType");
  819. if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming"))
  820. {
  821. cout << "Error: " << windowType << " type of window doesnt exist" << endl;
  822. return false;
  823. }
  824. windLen = parser.get<int>("windLen");
  825. if (windLen <= 0)
  826. {
  827. cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl;
  828. return false;
  829. }
  830. overlap = parser.get<int>("overlap");
  831. if (overlap <= 0)
  832. {
  833. cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl;
  834. return false;
  835. }
  836. enableGrid = parser.get<bool>("enableGrid");
  837. rows = parser.get<int>("rows");
  838. if (rows <= 0)
  839. {
  840. cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl;
  841. return false;
  842. }
  843. cols = parser.get<int>("cols");
  844. if (cols <= 0)
  845. {
  846. cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl;
  847. return false;
  848. }
  849. xmarkup = parser.get<int>("xmarkup");
  850. if (xmarkup < 2)
  851. {
  852. cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl;
  853. return false;
  854. }
  855. ymarkup = parser.get<int>("ymarkup");
  856. if (ymarkup < 2)
  857. {
  858. cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl;
  859. return false;
  860. }
  861. zmarkup = parser.get<int>("zmarkup");
  862. if (zmarkup < 2)
  863. {
  864. cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl;
  865. return false;
  866. }
  867. microTime = parser.get<int>("microTime");
  868. if (microTime <= 0)
  869. {
  870. cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl;
  871. return false;
  872. }
  873. frameSizeTime = parser.get<int>("frameSizeTime");
  874. if (frameSizeTime <= 0)
  875. {
  876. cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl;
  877. return false;
  878. }
  879. updateTime = parser.get<int>("updateTime");
  880. if (updateTime <= 0)
  881. {
  882. cout << "Error: updateTime = " << updateTime << " - incorrect value. Must be > 0" << endl;
  883. return false;
  884. }
  885. waitTime = parser.get<int>("waitTime");
  886. if (waitTime < 0)
  887. {
  888. cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl;
  889. return false;
  890. }
  891. return true;
  892. }
  893. private :
  894. string inputType;
  895. string draw;
  896. string graph;
  897. string audio;
  898. int audioStream;
  899. string windowType;
  900. int windLen;
  901. int overlap;
  902. bool enableGrid;
  903. int rows;
  904. int cols;
  905. int xmarkup;
  906. int ymarkup;
  907. int zmarkup;
  908. int microTime;
  909. int frameSizeTime;
  910. int updateTime;
  911. int waitTime;
  912. };
  913. int main(int argc, char** argv)
  914. {
  915. const String keys =
  916. "{help h usage ? | | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}"
  917. "{inputType i | file | file or microphone }"
  918. "{draw d | static | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}"
  919. "{graph g | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}"
  920. "{audio a | Megamind.avi | name and path to file }"
  921. "{audioStream s | 1 | CAP_PROP_AUDIO_STREAM value. Select audio stream number }"
  922. "{windowType t | Rect | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }"
  923. "{windLen l | 256 | size of window for STFT }"
  924. "{overlap o | 128 | overlap of windows for STFT }"
  925. "{enableGrid | false | grid on the amplitude graph }"
  926. "{rows r | 400 | rows of output image }"
  927. "{cols c | 900 | cols of output image }"
  928. "{xmarkup x | 5 | number of x axis divisions (time asix) }"
  929. "{ymarkup y | 5 | number of y axis divisions (frequency or/and amplitude axis) }"
  930. "{zmarkup z | 5 | number of z axis divisions (colorbar) }"
  931. "{microTime m | 20 | time of recording audio with microphone in seconds }"
  932. "{frameSizeTime f| 5 | size of sliding window in seconds }"
  933. "{updateTime u | 1 | update time of sliding window in seconds }"
  934. "{waitTime w | 10 | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }"
  935. ;
  936. CommandLineParser parser(argc, argv, keys);
  937. if (parser.has("help"))
  938. {
  939. parser.printMessage();
  940. return 0;
  941. }
  942. AudioDrawing draw(parser);
  943. return 0;
  944. }