123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071 |
- #include <opencv2/core.hpp>
- #include <opencv2/videoio.hpp>
- #include <opencv2/highgui.hpp>
- #include <opencv2/imgproc.hpp>
- #include <iostream>
- #include <vector>
- #include <string>
- #include <cmath>
- using namespace cv;
- using namespace std;
- class AudioDrawing
- {
- public:
- AudioDrawing(const CommandLineParser& parser) {
- if (!initAndCheckArgs(parser))
- {
- cerr << "Error: Wrong input arguments" << endl;
- exit(0);
- }
- Draw();
- }
- void Draw() {
- if (draw == "static")
- {
- vector<int>inputAudio = {};
- int samplingRate = 0;
- if (inputType == "file")
- {
- samplingRate = readAudioFile(audio, inputAudio);
- }
- else if (inputType == "microphone")
- {
- samplingRate = readAudioMicrophone(inputAudio);
- }
- if ((inputAudio.size() == 0) || samplingRate <= 0)
- {
- cerr << "Error: problems with audio reading, check input arguments" << endl;
- return;
- }
- int duration = static_cast<int>(inputAudio.size()) / samplingRate;
- // since the dimensional grid is counted in integer seconds,
- // if the input audio has an incomplete last second,
- // then it is filled with zeros to complete
- int remainder = static_cast<int>(inputAudio.size()) % samplingRate;
- if (remainder)
- {
- int sizeToFullSec = samplingRate - remainder;
- for (int j = 0; j < sizeToFullSec; ++j)
- {
- inputAudio.push_back(0);
- }
- duration += 1;
- cout << "Update duration of audio to full last second with " <<
- sizeToFullSec << " zero samples" << endl;
- cout << "New number of samples " << inputAudio.size() << endl;
- }
- cout << "Duration of audio = " << duration << " seconds" << endl;
- // since the dimensional grid is counted in integer seconds,
- // if duration of file is less than xmarkup, to avoid an incorrect display,
- // xmarkup will be taken equal to duration
- if (duration <= xmarkup)
- {
- xmarkup = duration + 1;
- }
- if (graph == "ampl")
- {
- Mat imgAmplitude = drawAmplitude(inputAudio);
- imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
- imshow("Display amplitude graph", imgAmplitude);
- waitKey(0);
- }
- else if (graph == "spec")
- {
- vector<vector<double>>stft = STFT(inputAudio);
- Mat imgSpec = drawSpectrogram(stft);
- imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
- imshow("Display spectrogram", imgSpec);
- waitKey(0);
- }
- else if (graph == "ampl_and_spec")
- {
- Mat imgAmplitude = drawAmplitude(inputAudio);
- imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
- vector<vector<double>>stft = STFT(inputAudio);
- Mat imgSpec = drawSpectrogram(stft);
- imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
- Mat imgTotal = concatenateImages(imgAmplitude, imgSpec);
- imshow("Display amplitude graph and spectrogram", imgTotal);
- waitKey(0);
- }
- }
- else if (draw == "dynamic")
- {
- if (inputType == "file")
- {
- dynamicFile(audio);
- }
- else if (inputType == "microphone")
- {
- dynamicMicrophone();
- }
- }
- }
- ~AudioDrawing() {
- }
- int readAudioFile(string file, vector<int>& inputAudio)
- {
- VideoCapture cap;
- vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream,
- CAP_PROP_VIDEO_STREAM, -1,
- CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
- cap.open(file, CAP_ANY, params);
- if (!cap.isOpened())
- {
- cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
- return -1;
- }
- const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
- const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
- cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
- int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
- cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
- vector<int> frameVec;
- Mat frame;
- for (;;)
- {
- if (cap.grab())
- {
- cap.retrieve(frame, audioBaseIndex);
- frameVec = frame;
- inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
- }
- else
- {
- cout << "Number of samples: " << inputAudio.size() << endl;
- break;
- }
- }
- return samplingRate;
- }
- int readAudioMicrophone(vector<int>& inputAudio)
- {
- VideoCapture cap;
- vector<int> params { CAP_PROP_AUDIO_STREAM, 0,
- CAP_PROP_VIDEO_STREAM, -1 };
- cap.open(0, CAP_ANY, params);
- if (!cap.isOpened())
- {
- cerr << "Error: Can't open microphone" << endl;
- return -1;
- }
- const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
- const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
- cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
- int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
- cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << samplingRate << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
- const double cvTickFreq = getTickFrequency();
- int64 sysTimeCurr = getTickCount();
- int64 sysTimePrev = sysTimeCurr;
- vector<int> frameVec;
- Mat frame;
- while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
- {
- if (cap.grab())
- {
- cap.retrieve(frame, audioBaseIndex);
- frameVec = frame;
- inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
- sysTimeCurr = getTickCount();
- }
- else
- {
- cerr << "Error: Grab error" << endl;
- break;
- }
- }
- cout << "Number of samples: " << inputAudio.size() << endl;
- return samplingRate;
- }
- Mat drawAmplitude(vector<int>& inputAudio)
- {
- Scalar color = Scalar(247,111,87);
- int thickness = 5;
- int frameVectorRows = 500;
- int middle = frameVectorRows / 2;
- // usually the input data is too big, so it is necessary
- // to reduce size using interpolation of data
- int frameVectorCols = 40000;
- if (static_cast<int>(inputAudio.size()) < frameVectorCols)
- {
- frameVectorCols = static_cast<int>(inputAudio.size());
- }
- Mat img(frameVectorRows, frameVectorCols, CV_8UC3 , Scalar(255,255,255)); // white background
- vector<double>reshapeAudio(inputAudio.size());
- for (size_t i = 0; i < inputAudio.size(); ++i)
- {
- reshapeAudio[i]=static_cast<double>(inputAudio[i]);
- }
- Mat img_frameVector( 1, static_cast<int>(reshapeAudio.size()), CV_64F , reshapeAudio.data());
- Mat img_frameVector_resize;
- resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), INTER_LINEAR);
- reshapeAudio = img_frameVector_resize;
- // normalization data by maximum element
- normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF);
- for (size_t i = 0; i < reshapeAudio.size(); ++i)
- {
- reshapeAudio[i] = middle - reshapeAudio[i] * middle;
- }
- for (int i = 1; i < static_cast<int>(reshapeAudio.size()); ++i)
- {
- line(img, Point(i-1, static_cast<int>(reshapeAudio[i-1])), Point(i, static_cast<int>(reshapeAudio[i])), color, thickness);
- }
- Mat resImage;
- resize(img, resImage, Size(900, 400), INTER_AREA );
- return resImage;
- }
- Mat drawAmplitudeScale(Mat& inputImg, const vector<int>& inputAudio, int samplingRate,
- int xmin = 0, int xmax = 0)
- {
- // function of layout drawing for graph of volume amplitudes
- // x axis for time
- // y axis for amplitudes
- // parameters for the new image size
- int preCol = 100;
- int aftCol = 100;
- int preLine = 40;
- int aftLine = 50;
- int frameVectorRows = inputImg.rows;
- int frameVectorCols = inputImg.cols;
- int totalRows = preLine + frameVectorRows + aftLine;
- int totalCols = preCol + frameVectorCols + aftCol;
- Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255));
- inputImg.copyTo(imgTotal(Rect(preCol, preLine, inputImg.cols, inputImg.rows)));
- // calculating values on x axis
- if (xmax == 0)
- {
- xmax = static_cast<int>(inputAudio.size()) / samplingRate;
- }
- std::vector<double> xList(xmarkup);
- if (xmax >= xmarkup)
- {
- double deltax = (xmax - xmin) / (xmarkup - 1);
- for (int i = 0; i < xmarkup; ++i)
- {
- xList[i] = (xmin + deltax * i);
- }
- }
- else
- {
- // this case is used to display a dynamic update
- vector<double> tmpXList;
- for (int i = xmin; i < xmax; ++i)
- {
- tmpXList.push_back(i + 1);
- }
- int k = 0;
- for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i)
- {
- xList[i] = tmpXList[k];
- k += 1;
- }
- }
- // calculating values on y axis
- double minCv; double maxCv; Point minLoc; Point maxLoc;
- minMaxLoc(inputAudio, &minCv, &maxCv, &minLoc, &maxLoc);
- int ymin = static_cast<int>(minCv);
- int ymax = static_cast<int>(maxCv);
- std::vector<double> yList(ymarkup);
- double deltay = (ymax - ymin) / (ymarkup - 1);
- for (int i = 0; i < ymarkup; ++i)
- {
- yList[i] = ymin + deltay * i;
- }
- // parameters for layout drawing
- int textThickness = 1;
- int gridThickness = 1;
- Scalar gridColor(0, 0, 0);
- Scalar textColor(0, 0, 0);
- float fontScale = 0.5;
- // horizontal axis
- line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
- gridColor, gridThickness);
- // vertical axis
- line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
- gridColor, gridThickness);
- // parameters for layout calculation
- int serifSize = 10;
- int indentDownX = serifSize * 2;
- int indentDownY = serifSize / 2;
- int indentLeftX = serifSize;
- int indentLeftY = 2 * preCol / 3;
- // drawing layout for x axis
- int numX = frameVectorCols / (xmarkup - 1);
- for (size_t i = 0; i < xList.size(); ++i)
- {
- int a1 = static_cast<int>(preCol + i * numX);
- int a2 = frameVectorRows + preLine;
- int b1 = a1;
- int b2 = a2 + serifSize;
- if (enableGrid)
- {
- int d1 = a1;
- int d2 = preLine;
- line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness);
- }
- line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
- putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
- FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
- }
- // drawing layout for y axis
- int numY = frameVectorRows / (ymarkup - 1);
- for (size_t i = 0; i < yList.size(); ++i) {
- int a1 = preCol;
- int a2 = static_cast<int>(totalRows - aftLine - i * numY);
- int b1 = preCol - serifSize;
- int b2 = a2;
- if (enableGrid)
- {
- int d1 = preCol + frameVectorCols;
- int d2 = a2;
- line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness);
- }
- line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
- putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
- FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
- }
- Mat resImage;
- resize(imgTotal, resImage, Size(cols, rows), INTER_AREA );
- return resImage;
- }
- vector<vector<double>> STFT(const vector<int>& inputAudio)
- {
- // The Short-time Fourier transform (STFT), is a Fourier-related transform used to
- // determine the sinusoidal frequency and phase content of local sections of a signal
- // as it changes over time.
- // In practice, the procedure for computing STFTs is to divide a longer time signal
- // into shorter segments of equal length and then compute the Fourier transform separately
- // on each shorter segment. This reveals the Fourier spectrum on each shorter segment.
- // One then usually plots the changing spectra as a function of time, known as a spectrogram
- // or waterfall plot.
- // https://en.wikipedia.org/wiki/Short-time_Fourier_transform
- int timeStep = windLen - overlap;
- Mat dstMat;
- vector<double> stftRow;
- vector<double> WindType;
- if (windowType == "Hann")
- {
- // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
- for (int j = 1 - windLen; j < windLen; j+=2)
- {
- WindType.push_back(j * (0.5 * (1 - cos(CV_PI * j / (windLen - 1)))));
- }
- }
- else if (windowType == "Hamming")
- {
- // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
- for (int j = 1 - windLen; j < windLen; j+=2)
- {
- WindType.push_back(j * (0.53836 - 0.46164 * (cos(CV_PI * j / (windLen - 1)))));
- }
- }
- for (size_t i = 0; i < inputAudio.size(); i += timeStep)
- {
- vector<double>section(windLen, 0);
- for (int j = 0; j < windLen; ++j)
- {
- section[j] = inputAudio[j + i];
- }
- if (windowType == "Hann" || windowType == "Hamming")
- {
- for (size_t j = 0; j < section.size(); ++j)
- {
- section[j] *= WindType[j];
- }
- }
- dft(section, dstMat, DFT_COMPLEX_OUTPUT);
- for (int j = 0; j < dstMat.cols / 4; ++j)
- {
- double complModule = sqrt(dstMat.at<double>(2*j) * dstMat.at<double>(2*j) +
- dstMat.at<double>(2*j+1) * dstMat.at<double>(2*j+1));
- stftRow.push_back(complModule);
- }
- }
- size_t xSize = inputAudio.size() / timeStep + 1;
- // we need only the first part of the spectrum, the second part is symmetrical
- size_t ySize = dstMat.cols / 4;
- vector<vector<double>> stft(ySize, vector<double>(xSize, 0.));
- for (size_t i = 0; i < xSize; ++i)
- {
- for (size_t j = 0; j < ySize; ++j)
- {
- // write elements with transposition and convert it to the decibel scale
- double stftElem = stftRow[ i * ySize + j];
- if (stftElem != 0.)
- {
- stft[j][i] = 10 * log10(stftElem);
- }
- }
- }
- return stft;
- }
- Mat drawSpectrogram(const vector<vector<double>>& stft)
- {
- int frameVectorRows = static_cast<int>(stft.size());
- int frameVectorCols = static_cast<int>(stft[0].size());
- // Normalization of image values from 0 to 255 to get more contrast image
- // and this normalization will be taken into account in the scale drawing
- int colormapImageRows = 255;
- double minCv; double maxCv; Point minLoc; Point maxLoc;
- minMaxLoc(stft[0], &minCv, &maxCv, &minLoc, &maxLoc);
- double maxStft = max(abs(maxCv), abs(minCv));
- for (int i = 1; i < frameVectorRows; ++i)
- {
- minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc);
- maxStft = max(maxStft, max(abs(maxCv), abs(minCv)));
- }
- // if maxStft is zero (silence)
- if (maxStft == 0.)
- {
- maxStft = 1;
- }
- Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255));
- for (int i = 0; i < frameVectorRows; ++i)
- {
- for (int j = 0; j < frameVectorCols; ++j)
- {
- imgSpec.at<uchar>(frameVectorRows - i - 1, j) = static_cast<uchar>(stft[i][j] * colormapImageRows / maxStft);
- }
- }
- applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO);
- Mat resImage;
- resize(imgSpec, resImage, Size(900, 400), INTER_AREA);
- return resImage;
- }
- Mat drawSpectrogramColorbar(Mat& inputImg, const vector<int>& inputAudio,
- int samplingRate, const vector<vector<double>>& stft,
- int xmin = 0, int xmax = 0)
- {
- // function of layout drawing for the three-dimensional graph of the spectrogram
- // x axis for time
- // y axis for frequencies
- // z axis for magnitudes of frequencies shown by color scale
- // parameters for the new image size
- int preCol = 100;
- int aftCol = 100;
- int preLine = 40;
- int aftLine = 50;
- int colColor = 20;
- int indCol = 20;
- int frameVectorRows = inputImg.rows;
- int frameVectorCols = inputImg.cols;
- int totalRows = preLine + frameVectorRows + aftLine;
- int totalCols = preCol + frameVectorCols + aftCol;
- Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255));
- inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows)));
- // colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0,
- // so here colorbar has values from 255 to 0
- int colorArrSize = 256;
- Mat imgColorBar = Mat (colorArrSize, colColor, CV_8UC1 , Scalar(255,255,255));
- for (int i = 0; i < colorArrSize; ++i)
- {
- for( int j = 0; j < colColor; ++j)
- {
- imgColorBar.at<uchar>(i, j) = static_cast<uchar>(colorArrSize - 1 - i); // from 255 to 0
- }
- }
- applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO);
- resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), INTER_AREA);
- imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows)));
- // calculating values on x axis
- if (xmax == 0)
- {
- xmax = static_cast<int>(inputAudio.size()) / samplingRate + 1;
- }
- vector<double> xList(xmarkup, 0);
- if (xmax >= xmarkup)
- {
- double deltax = (xmax - xmin) / (xmarkup - 1);
- for(int i = 0; i < xmarkup; ++i)
- {
- xList[i] = xmin + deltax * i;
- }
- }
- else
- {
- // this case is used to display a dynamic update
- vector<double> tmpXList;
- for(int i = xmin; i < xmax; ++i)
- {
- tmpXList.push_back(i + 1);
- }
- int k = 0;
- for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i)
- {
- xList[i] = tmpXList[k];
- k += 1;
- }
- }
- // calculating values on y axis
- // according to the Nyquist sampling theorem,
- // signal should posses frequencies equal to half of sampling rate
- int ymin = 0;
- int ymax = static_cast<int>(samplingRate / 2);
- vector<double> yList;
- double deltay = (ymax - ymin) / (ymarkup - 1);
- for(int i = 0; i < ymarkup; ++i)
- {
- yList.push_back(ymin + deltay * i);
- }
- // calculating values on z axis
- double minCv; double maxCv; Point minLoc; Point maxLoc;
- minMaxLoc( stft[0], &minCv, &maxCv, &minLoc, &maxLoc);
- double zmin = minCv, zmax = maxCv;
- std::vector<double> zList;
- for (size_t i = 1; i < stft.size(); ++i)
- {
- minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc);
- zmax = max(zmax, maxCv);
- zmin = min(zmin, minCv);
- }
- double deltaz = (zmax - zmin) / (zmarkup - 1);
- for(int i = 0; i < zmarkup; ++i)
- {
- zList.push_back(zmin + deltaz * i);
- }
- // parameters for layout drawing
- int textThickness = 1;
- int gridThickness = 1;
- Scalar gridColor(0,0,0);
- Scalar textColor(0,0,0);
- float fontScale = 0.5;
- int serifSize = 10;
- int indentDownX = serifSize * 2;
- int indentDownY = serifSize / 2;
- int indentLeftX = serifSize;
- int indentLeftY = 2 * preCol / 3;
- // horizontal axis
- line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
- gridColor, gridThickness);
- // vertical axis
- line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
- gridColor, gridThickness);
- // drawing layout for x axis
- int numX = frameVectorCols / (xmarkup - 1);
- for (size_t i = 0; i < xList.size(); ++i)
- {
- int a1 = static_cast<int>(preCol + i * numX);
- int a2 = frameVectorRows + preLine;
- int b1 = a1;
- int b2 = a2 + serifSize;
- line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
- putText(imgTotal, to_string(static_cast<int>(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
- FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
- }
- // drawing layout for y axis
- int numY = frameVectorRows / (ymarkup - 1);
- for (size_t i = 0; i < yList.size(); ++i)
- {
- int a1 = preCol;
- int a2 = static_cast<int>(totalRows - aftLine - i * numY);
- int b1 = preCol - serifSize;
- int b2 = a2;
- line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
- putText(imgTotal, to_string(static_cast<int>(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
- FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
- }
- // drawing layout for z axis
- int numZ = frameVectorRows / (zmarkup - 1);
- for (size_t i = 0; i < zList.size(); ++i)
- {
- int a1 = preCol + frameVectorCols + indCol + colColor;
- int a2 = static_cast<int>(totalRows - aftLine - i * numZ);
- int b1 = a1 + serifSize;
- int b2 = a2;
- line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
- putText(imgTotal, to_string(static_cast<int>(zList[i])), Point(b1 + 10, b2 + indentDownY),
- FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
- }
- Mat resImage;
- resize(imgTotal, resImage, Size(cols, rows), INTER_AREA );
- return resImage;
- }
- Mat concatenateImages(Mat& img1, Mat& img2)
- {
- // first image will be under the second image
- int totalRows = img1.rows + img2.rows;
- int totalCols = max(img1.cols , img2.cols);
- // if images columns do not match, the difference is filled in white
- Mat imgTotal = Mat (totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255));
- img1.copyTo(imgTotal(Rect(0, 0, img1.cols, img1.rows)));
- img2.copyTo(imgTotal(Rect(0, img1.rows, img2.cols, img2.rows)));
- return imgTotal;
- }
- void dynamicFile(const string file)
- {
- VideoCapture cap;
- vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream,
- CAP_PROP_VIDEO_STREAM, -1,
- CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
- cap.open(file, CAP_ANY, params);
- if (!cap.isOpened())
- {
- cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
- return;
- }
- const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
- const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
- int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
- cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
- cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
- int step = static_cast<int>(updateTime * samplingRate);
- int frameSize = static_cast<int>(frameSizeTime * samplingRate);
- // since the dimensional grid is counted in integer seconds,
- // if duration of audio frame is less than xmarkup, to avoid an incorrect display,
- // xmarkup will be taken equal to duration
- if (frameSizeTime <= xmarkup)
- {
- xmarkup = frameSizeTime;
- }
- vector<int> buffer;
- vector<int> frameVector;
- vector<int> section(frameSize, 0);
- vector<vector<double>>stft;
- Mat frame, imgAmplitude, imgSpec, imgTotal;
- int currentSamples = 0;
- int xmin = 0;
- int xmax = 0;
- for (;;)
- {
- if (cap.grab())
- {
- cap.retrieve(frame, audioBaseIndex);
- frameVector = frame;
- buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
- int bufferSize = static_cast<int>(buffer.size());
- if (bufferSize >= step)
- {
- currentSamples += bufferSize;
- section.erase(section.begin(), section.begin() + step);
- section.insert(section.end(), buffer.begin(), buffer.end());
- buffer.erase(buffer.begin(), buffer.begin() + step);
- if (currentSamples < frameSize)
- {
- xmin = 0;
- xmax = (currentSamples) / samplingRate;
- }
- else
- {
- xmin = (currentSamples - frameSize) / samplingRate + 1;
- xmax = (currentSamples) / samplingRate;
- }
- if (graph == "ampl")
- {
- imgAmplitude = drawAmplitude(section);
- imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
- imshow("Display amplitude graph", imgAmplitude);
- waitKey(waitTime);
- }
- else if (graph == "spec")
- {
- stft = STFT(section);
- imgSpec = drawSpectrogram(stft);
- imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
- imshow("Display spectrogram", imgSpec);
- waitKey(waitTime);
- }
- else if (graph == "ampl_and_spec")
- {
- imgAmplitude = drawAmplitude(section);
- imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
- stft = STFT(section);
- imgSpec = drawSpectrogram(stft);
- imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
- imgTotal = concatenateImages(imgAmplitude, imgSpec);
- imshow("Display amplitude graph and spectrogram", imgTotal);
- waitKey(waitTime);
- }
- }
- }
- else
- {
- break;
- }
- }
- }
- void dynamicMicrophone()
- {
- VideoCapture cap;
- vector<int> params { CAP_PROP_AUDIO_STREAM, 0,
- CAP_PROP_VIDEO_STREAM, -1 };
- cap.open(0, CAP_MSMF, params);
- if (!cap.isOpened())
- {
- cerr << "Error: Can't open microphone" << endl;
- return;
- }
- const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
- const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
- int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
- cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
- cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
- cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
- const double cvTickFreq = getTickFrequency();
- int64 sysTimeCurr = getTickCount();
- int64 sysTimePrev = sysTimeCurr;
- int step = (updateTime * samplingRate);
- int frameSize = (frameSizeTime * samplingRate);
- // since the dimensional grid is counted in integer seconds,
- // if duration of audio frame is less than xmarkup, to avoid an incorrect display,
- // xmarkup will be taken equal to duration
- if (frameSizeTime <= xmarkup)
- {
- xmarkup = frameSizeTime;
- }
- vector<int> frameVector;
- vector<int> buffer;
- vector<int> section(frameSize, 0);
- Mat frame, imgAmplitude, imgSpec, imgTotal;
- int currentSamples = 0;
- vector<vector<double>> stft;
- int xmin = 0;
- int xmax = 0;
- waitTime = updateTime * 1000;
- while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
- {
- if (cap.grab())
- {
- cap.retrieve(frame, audioBaseIndex);
- frameVector = frame;
- buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
- sysTimeCurr = getTickCount();
- int bufferSize = static_cast<int>(buffer.size());
- if (bufferSize >= step)
- {
- currentSamples += step;
- section.erase(section.begin(), section.begin() + step);
- section.insert(section.end(), buffer.begin(), buffer.end());
- buffer.erase(buffer.begin(), buffer.begin() + step);
- if (currentSamples < frameSize)
- {
- xmin = 0;
- xmax = (currentSamples) / samplingRate;
- }
- else
- {
- xmin = (currentSamples - frameSize) / samplingRate + 1;
- xmax = (currentSamples) / samplingRate;
- }
- if (graph == "ampl")
- {
- imgAmplitude = drawAmplitude(section);
- imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
- imshow("Display amplitude graph", imgAmplitude);
- waitKey(waitTime);
- }
- else if (graph == "spec")
- {
- stft = STFT(section);
- imgSpec = drawSpectrogram(stft);
- imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
- imshow("Display spectrogram", imgSpec);
- waitKey(waitTime);
- }
- else if (graph == "ampl_and_spec")
- {
- imgAmplitude = drawAmplitude(section);
- imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
- stft = STFT(section);
- imgSpec = drawSpectrogram(stft);
- imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
- imgTotal = concatenateImages(imgAmplitude, imgSpec);
- imshow("Display amplitude graph and spectrogram", imgTotal);
- waitKey(waitTime);
- }
- }
- }
- else
- {
- cerr << "Error: Grab error" << endl;
- break;
- }
- }
- }
- bool initAndCheckArgs(const CommandLineParser& parser)
- {
- inputType = parser.get<string>("inputType");
- if ((inputType != "file") && (inputType != "microphone"))
- {
- cout << "Error: " << inputType << " input method doesnt exist" << endl;
- return false;
- }
- draw = parser.get<string>("draw");
- if ((draw != "static") && (draw != "dynamic"))
- {
- cout << "Error: " << draw << " draw type doesnt exist" << endl;
- return false;
- }
- graph = parser.get<string>("graph");
- if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec"))
- {
- cout << "Error: " << graph << " type of graph doesnt exist" << endl;
- return false;
- }
- audio = samples::findFile(parser.get<std::string>("audio"));
- audioStream = parser.get<int>("audioStream");
- if (audioStream < 0)
- {
- cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl;
- return false;
- }
- windowType = parser.get<string>("windowType");
- if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming"))
- {
- cout << "Error: " << windowType << " type of window doesnt exist" << endl;
- return false;
- }
- windLen = parser.get<int>("windLen");
- if (windLen <= 0)
- {
- cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- overlap = parser.get<int>("overlap");
- if (overlap <= 0)
- {
- cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- enableGrid = parser.get<bool>("enableGrid");
- rows = parser.get<int>("rows");
- if (rows <= 0)
- {
- cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- cols = parser.get<int>("cols");
- if (cols <= 0)
- {
- cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- xmarkup = parser.get<int>("xmarkup");
- if (xmarkup < 2)
- {
- cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl;
- return false;
- }
- ymarkup = parser.get<int>("ymarkup");
- if (ymarkup < 2)
- {
- cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl;
- return false;
- }
- zmarkup = parser.get<int>("zmarkup");
- if (zmarkup < 2)
- {
- cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl;
- return false;
- }
- microTime = parser.get<int>("microTime");
- if (microTime <= 0)
- {
- cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- frameSizeTime = parser.get<int>("frameSizeTime");
- if (frameSizeTime <= 0)
- {
- cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- updateTime = parser.get<int>("updateTime");
- if (updateTime <= 0)
- {
- cout << "Error: updateTime = " << updateTime << " - incorrect value. Must be > 0" << endl;
- return false;
- }
- waitTime = parser.get<int>("waitTime");
- if (waitTime < 0)
- {
- cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl;
- return false;
- }
- return true;
- }
- private :
- string inputType;
- string draw;
- string graph;
- string audio;
- int audioStream;
- string windowType;
- int windLen;
- int overlap;
- bool enableGrid;
- int rows;
- int cols;
- int xmarkup;
- int ymarkup;
- int zmarkup;
- int microTime;
- int frameSizeTime;
- int updateTime;
- int waitTime;
- };
- int main(int argc, char** argv)
- {
- const String keys =
- "{help h usage ? | | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}"
- "{inputType i | file | file or microphone }"
- "{draw d | static | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}"
- "{graph g | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}"
- "{audio a | Megamind.avi | name and path to file }"
- "{audioStream s | 1 | CAP_PROP_AUDIO_STREAM value. Select audio stream number }"
- "{windowType t | Rect | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }"
- "{windLen l | 256 | size of window for STFT }"
- "{overlap o | 128 | overlap of windows for STFT }"
- "{enableGrid | false | grid on the amplitude graph }"
- "{rows r | 400 | rows of output image }"
- "{cols c | 900 | cols of output image }"
- "{xmarkup x | 5 | number of x axis divisions (time asix) }"
- "{ymarkup y | 5 | number of y axis divisions (frequency or/and amplitude axis) }"
- "{zmarkup z | 5 | number of z axis divisions (colorbar) }"
- "{microTime m | 20 | time of recording audio with microphone in seconds }"
- "{frameSizeTime f| 5 | size of sliding window in seconds }"
- "{updateTime u | 1 | update time of sliding window in seconds }"
- "{waitTime w | 10 | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }"
- ;
- CommandLineParser parser(argc, argv, keys);
- if (parser.has("help"))
- {
- parser.printMessage();
- return 0;
- }
- AudioDrawing draw(parser);
- return 0;
- }
|