123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
- #include "Steger.hpp"
- #include "time.h"
- #include <stdio.h>
- #define BLOCK_SIZE 32768
- #define THREAD_SIZE 512
/**
 * Convolves an image with a 1-D mask along the row index (i.e. vertically).
 *
 * The image is addressed as a flat row-major array: pixel (r, c) lives at
 * linear index i = r * width + c.  For every pixel the kernel computes
 *
 *     h[i] = sum over j in [-n, n] of image[(r + j) * width + c] * mask[j + n]
 *
 * Pixels whose row satisfies n <= r < height - n read their neighbours
 * directly.  For the remaining border rows the neighbour row index is passed
 * through the BR() macro and the address is built with LINCOOR(); both come
 * from Steger.hpp.
 * NOTE(review): BR() presumably reflects out-of-range rows back into the
 * image and may rely on the identifier `height` being in scope - confirm
 * against Steger.hpp before renaming any parameter.
 *
 * Launch layout: 1-D grid of 1-D blocks.  The loop derives its start index and
 * stride from blockDim/gridDim, so any grid or block size yields the same
 * result.  No shared memory is used.
 *
 * @param image   input pixels, width * height floats, dereferenced on the device
 * @param mask    convolution mask with 2 * n + 1 coefficients
 * @param n       half-width of the mask
 * @param h       output buffer, width * height floats; every element is written
 * @param width   number of columns
 * @param height  number of rows
 */
__global__ void kenel_convolve_rows_gauss(float* image, float* mask, long n, float* h, long width, long height)
{
    // 64-bit arithmetic: width * height and the running index may exceed INT_MAX.
    const long long total  = (long long)width * (long long)height;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
    {
        const long r = (long)(i / width);
        const long c = (long)(i % width);
        float sum = 0.0f;

        if (r >= n && r < height - n)
        {
            // Interior row: all 2n+1 vertical neighbours are inside the image.
            for (long j = -n; j <= n; j++)
                sum += image[i + j * width] * mask[j + n];
        }
        else
        {
            // Border row: remap the neighbour row through BR() before addressing.
            for (long j = -n; j <= n; j++)
                sum += image[LINCOOR(BR(r + j), c, width)] * mask[j + n];
        }

        h[i] = sum;
    }
}
/**
 * Convolves an image with a 1-D mask along the column index (i.e. horizontally).
 *
 * The data is addressed as a flat row-major array: pixel (r, c) lives at
 * linear index i = r * width + c.  For every pixel the kernel computes
 *
 *     k[i] = sum over j in [-n, n] of h[r * width + (c + j)] * mask[j + n]
 *
 * Pixels whose column satisfies n <= c < width - n read their neighbours
 * directly.  For all other (border) columns the neighbour column index is
 * passed through the BC() macro and the address is built with LINCOOR(); both
 * come from Steger.hpp.
 * NOTE(review): BC() presumably reflects out-of-range columns back into the
 * image and may rely on the identifier `width` being in scope - confirm
 * against Steger.hpp before renaming any parameter.
 *
 * Launch layout: 1-D grid of 1-D blocks.  The loop derives its start index and
 * stride from blockDim/gridDim, so any grid or block size yields the same
 * result.  No shared memory is used.
 *
 * @param h       input buffer, width * height floats, dereferenced on the device
 * @param mask    convolution mask with 2 * n + 1 coefficients
 * @param n       half-width of the mask
 * @param k       output buffer, width * height floats; every element is written
 * @param width   number of columns
 * @param height  number of rows
 */
__global__ void kenel_convolve_cols_gauss(float *h,
                                          float *mask,
                                          long n,
                                          float *k,
                                          long width,
                                          long height)
{
    // 64-bit arithmetic: width * height and the running index may exceed INT_MAX.
    const long long total  = (long long)width * (long long)height;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
    {
        const long r = (long)(i / width);
        const long c = (long)(i % width);
        float sum = 0.0f;

        if (c >= n && c < width - n)
        {
            // Interior column: all 2n+1 horizontal neighbours are inside the row.
            for (long j = -n; j <= n; j++)
                sum += h[i + j] * mask[j + n];
        }
        else
        {
            // Left (c < n) or right (c >= width - n) border: both sides use the
            // same remapping, so a single branch covers them.
            for (long j = -n; j <= n; j++)
                sum += h[LINCOOR(r, BC(c + j), width)] * mask[j + n];
        }

        k[i] = sum;
    }
}
/**
 * Reports a failed CUDA runtime status on stderr.
 *
 * @param err   status returned by a CUDA runtime call
 * @param what  short description of the operation that produced `err`
 * @return true when `err` is cudaSuccess, false otherwise
 */
static bool steger_cuda_ok(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "Steger::run_gpu: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Runs one separable convolution: the row kernel from `image` into the scratch
 * buffer `mid`, then the column kernel from `mid` into `out`.
 *
 * Both kernels are launched on the default stream with `blocks` blocks of
 * THREAD_SIZE threads; the launch status is checked after each one.  The
 * function does not wait for the kernels to finish.
 *
 * @return true when both launches were accepted, false after the first failure
 */
static bool steger_gauss_pass(float* image,
                              float* rowMask, long rowN,
                              float* colMask, long colN,
                              float* mid, float* out,
                              long width, long height,
                              unsigned int blocks)
{
    kenel_convolve_rows_gauss <<<blocks, THREAD_SIZE>>>(image, rowMask, rowN, mid, width, height);
    if (!steger_cuda_ok(cudaGetLastError(), "row convolution launch"))
        return false;

    kenel_convolve_cols_gauss <<<blocks, THREAD_SIZE>>>(mid, colMask, colN, out, width, height);
    return steger_cuda_ok(cudaGetLastError(), "column convolution launch");
}

/**
 * Fills m_k[0..4] with five separable convolutions of m_image.
 *
 * Each output uses one row mask and one column mask:
 *
 *     m_k[0]: rows m_mask_1_r, cols m_mask_0_c
 *     m_k[1]: rows m_mask_0_r, cols m_mask_1_c
 *     m_k[2]: rows m_mask_2_r, cols m_mask_0_c
 *     m_k[3]: rows m_mask_1_r, cols m_mask_1_c
 *     m_k[4]: rows m_mask_0_r, cols m_mask_2_c
 *
 * NOTE(review): the 0/1/2 in the mask names presumably denotes the derivative
 * order of the Gaussian - confirm against where the masks are built.
 *
 * m_mid_data is reused as the intermediate buffer of every pass.  It is not
 * cleared between passes because the row kernel writes every one of its
 * width * height elements before the column kernel reads them.
 *
 * The work is only enqueued; this function does not synchronize with the
 * device.  On a launch error a message is printed to stderr and the remaining
 * passes are skipped.
 */
void Steger::run_gpu()
{
    const long width  = m_width;
    const long height = m_height;

    // Widen before multiplying so the pixel count cannot overflow a narrow type.
    const long long total = (long long)width * (long long)height;
    if (total <= 0)
        return;  // nothing to process; a zero-block launch would be rejected

    // One thread per pixel, capped at BLOCK_SIZE blocks; the kernels loop over
    // any remainder, so the cap only matters for very large images.
    const long long needed = (total + THREAD_SIZE - 1) / THREAD_SIZE;
    const unsigned int blocks = (unsigned int)(needed < BLOCK_SIZE ? needed : BLOCK_SIZE);

    if (!steger_gauss_pass(m_image, m_mask_1_r, m_l_1r, m_mask_0_c, m_l_0c, m_mid_data, m_k[0], width, height, blocks))
        return;
    if (!steger_gauss_pass(m_image, m_mask_0_r, m_l_0r, m_mask_1_c, m_l_1c, m_mid_data, m_k[1], width, height, blocks))
        return;
    if (!steger_gauss_pass(m_image, m_mask_2_r, m_l_2r, m_mask_0_c, m_l_0c, m_mid_data, m_k[2], width, height, blocks))
        return;
    if (!steger_gauss_pass(m_image, m_mask_1_r, m_l_1r, m_mask_1_c, m_l_1c, m_mid_data, m_k[3], width, height, blocks))
        return;
    steger_gauss_pass(m_image, m_mask_0_r, m_l_0r, m_mask_2_c, m_l_2c, m_mid_data, m_k[4], width, height, blocks);
}
|