// Steger.cu -- CUDA convolution kernels and the Steger::run_gpu() host driver.
  #include "Steger.hpp"
  #include "time.h"
  #include <stdio.h>

  // Launch configuration used by the kernel launches in Steger::run_gpu().
  // NOTE: despite its name, BLOCK_SIZE is passed as the FIRST launch argument,
  // i.e. it is the number of blocks in the grid, not the size of a block.
  #define BLOCK_SIZE 32768
  // Passed as the SECOND launch argument: number of threads in each block.
  #define THREAD_SIZE 512
  /**
   * Convolves `image` with a 1-D mask across neighbouring rows and stores the
   * result in `h`.
   *
   * The image is addressed as a flat array with `width` elements per row
   * (row = i / width, column = i % width). For every element the kernel sums
   * image(row + j, column) * mask[j + n] for j in [-n, n].
   *
   * @param image  input samples, width * height floats
   * @param mask   filter taps; indices 0 .. 2n are read
   * @param n      half-width of the mask
   * @param h      output, same layout as `image`; every element is written
   * @param width  number of columns
   * @param height number of rows
   *
   * Launch: 1-D grid of 1-D blocks of any size; only the .x components of the
   * built-in index variables are used and no shared memory is required.
   *
   * NOTE(review): LINCOOR and BR are not defined in this file (presumably they
   * come from Steger.hpp). BR is used to remap out-of-range rows near the top
   * and bottom borders and may expand to an expression that refers to the
   * identifier `height` -- confirm before renaming any parameter.
   */
  __global__ void kenel_convolve_rows_gauss(float* image, float* mask, long n, float* h, long width, long height)
  {
      // 64-bit indices: width * height may not fit in a 32-bit int.
      const long long total = (long long)width * (long long)height;
      const long long stride = (long long)gridDim.x * (long long)blockDim.x;

      for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
      {
          const long long r = i / width;
          float sum = 0.0f;

          if (r >= n && r < height - n)
          {
              // Interior: all 2n+1 rows exist, so step through them directly.
              for (long long j = -n; j <= n; j++)
                  sum += image[i + j * width] * mask[j + n];
          }
          else
          {
              // Border: row + j can fall outside the image; BR() remaps it.
              const long long c = i % width;
              for (long long j = -n; j <= n; j++)
                  sum += image[LINCOOR(BR(r + j), c, width)] * mask[j + n];
          }

          h[i] = sum;
      }
  }
  /**
   * Convolves `h` with a 1-D mask across neighbouring columns and stores the
   * result in `k`.
   *
   * The data is addressed as a flat array with `width` elements per row
   * (row = i / width, column = i % width). For every element the kernel sums
   * h(row, column + j) * mask[j + n] for j in [-n, n].
   *
   * @param h      input samples, width * height floats
   * @param mask   filter taps; indices 0 .. 2n are read
   * @param n      half-width of the mask
   * @param k      output, same layout as `h`; every element is written
   * @param width  number of columns
   * @param height number of rows
   *
   * Launch: 1-D grid of 1-D blocks of any size; only the .x components of the
   * built-in index variables are used and no shared memory is required.
   *
   * NOTE(review): LINCOOR and BC are not defined in this file (presumably they
   * come from Steger.hpp). BC is used to remap out-of-range columns near the
   * left and right borders and may expand to an expression that refers to the
   * identifier `width` -- confirm before renaming any parameter.
   */
  __global__ void kenel_convolve_cols_gauss(float *h,
                                            float *mask,
                                            long n,
                                            float *k,
                                            long width,
                                            long height)
  {
      // 64-bit indices: width * height may not fit in a 32-bit int.
      const long long total = (long long)width * (long long)height;
      const long long stride = (long long)gridDim.x * (long long)blockDim.x;

      for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
      {
          const long long c = i % width;
          float sum = 0.0f;

          if (c >= n && c < width - n)
          {
              // Interior: all 2n+1 neighbours lie in the same row.
              for (long long j = -n; j <= n; j++)
                  sum += h[i + j] * mask[j + n];
          }
          else
          {
              // Left or right border (c < n or c >= width - n): column + j can
              // fall outside the row; BC() remaps it. Both borders use the
              // same formula, so a single branch covers them.
              const long long r = i / width;
              for (long long j = -n; j <= n; j++)
                  sum += h[LINCOOR(r, BC(c + j), width)] * mask[j + n];
          }

          k[i] = sum;
      }
  }
  // Reports a failed CUDA runtime call on stderr; returns true when `err` is cudaSuccess.
  static bool steger_cuda_ok(cudaError_t err, const char* what)
  {
      if (err == cudaSuccess)
          return true;
      fprintf(stderr, "Steger::run_gpu: %s failed: %s\n", what, cudaGetErrorString(err));
      return false;
  }

  // Enqueues one two-stage convolution: clears `mid`, filters `image` into `mid`
  // with the row kernel, then filters `mid` into `out` with the column kernel.
  // Returns false (after logging) as soon as any step reports an error.
  static bool steger_convolve_pass(float* image,
                                   float* rowMask, long rowN,
                                   float* colMask, long colN,
                                   float* mid, float* out,
                                   long width, long height)
  {
      // Widen before multiplying so the byte count cannot overflow a narrow type.
      const size_t bytes = (size_t)width * (size_t)height * sizeof(float);

      // The row kernel stores to every element of `mid`, so this clear looks
      // redundant; it is kept to match the original sequence of operations.
      if (!steger_cuda_ok(cudaMemset(mid, 0, bytes), "cudaMemset(m_mid_data)"))
          return false;

      // Third launch argument: 0 bytes of dynamic shared memory.
      kenel_convolve_rows_gauss <<<BLOCK_SIZE, THREAD_SIZE, 0 >>>(image, rowMask, rowN, mid, width, height);
      if (!steger_cuda_ok(cudaGetLastError(), "kenel_convolve_rows_gauss launch"))
          return false;

      kenel_convolve_cols_gauss <<<BLOCK_SIZE, THREAD_SIZE, 0 >>>(mid, colMask, colN, out, width, height);
      return steger_cuda_ok(cudaGetLastError(), "kenel_convolve_cols_gauss launch");
  }

  /**
   * Fills m_k[0] .. m_k[4] by running five row-then-column convolutions of
   * m_image, each with a different pair of masks, using m_mid_data as the
   * intermediate buffer between the two stages.
   *
   *   m_k[0]: row mask 1, column mask 0
   *   m_k[1]: row mask 0, column mask 1
   *   m_k[2]: row mask 2, column mask 0
   *   m_k[3]: row mask 1, column mask 1
   *   m_k[4]: row mask 0, column mask 2
   *
   * NOTE(review): the mask index presumably denotes the derivative order of a
   * Gaussian kernel -- confirm where the m_mask_* members are built.
   *
   * Errors from cudaMemset or from a kernel launch are logged to stderr and
   * stop the remaining passes. This function does not synchronize with the
   * device: it only enqueues work, so the caller must synchronize (or use a
   * blocking copy) before reading m_k.
   */
  void Steger::run_gpu()
  {
      if (!steger_convolve_pass(m_image, m_mask_1_r, m_l_1r, m_mask_0_c, m_l_0c, m_mid_data, m_k[0], m_width, m_height))
          return;
      if (!steger_convolve_pass(m_image, m_mask_0_r, m_l_0r, m_mask_1_c, m_l_1c, m_mid_data, m_k[1], m_width, m_height))
          return;
      if (!steger_convolve_pass(m_image, m_mask_2_r, m_l_2r, m_mask_0_c, m_l_0c, m_mid_data, m_k[2], m_width, m_height))
          return;
      if (!steger_convolve_pass(m_image, m_mask_1_r, m_l_1r, m_mask_1_c, m_l_1c, m_mid_data, m_k[3], m_width, m_height))
          return;
      steger_convolve_pass(m_image, m_mask_0_r, m_l_0r, m_mask_2_c, m_l_2c, m_mid_data, m_k[4], m_width, m_height);
  }