FCam: src/processing/Demosaic.cpp Source File

00001 #include <cmath>
00002 #include <iostream>
00003 #include <algorithm>
00004 #include <map>
00005 #include <vector>
00006 #include <string.h>
00007 #ifdef FCAM_ARCH_ARM
00008 #include <arm_neon.h>
00009 #endif
00010 
00011 #include <FCam/processing/Demosaic.h>
00012 #include <FCam/Sensor.h>
00013 #include <FCam/Time.h>
00014 
00015 
00016 namespace FCam {
00017 
00018     void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut) {
00019         unsigned short minRaw = f.minRawValue()+blackLevel;
00020         unsigned short maxRaw = f.maxRawValue();
00021         
00022         for (int i = 0; i <= minRaw; i++) {
00023             lut[i] = 0;
00024         }
00025         
00026         float invRange = 1.0f/(maxRaw - minRaw);
00027         float b = 2 - powf(2.0f, contrast/100.0f);
00028         float a = 2 - 2*b; 
00029         for (int i = minRaw+1; i <= maxRaw; i++) {
00030             // Get a linear luminance in the range 0-1
00031             float y = (i-minRaw)*invRange;
00032             // Gamma correct it
00033             y = powf(y, 1.0f/gamma);
00034             // Apply a piecewise quadratic contrast curve
00035             if (y > 0.5) {
00036                 y = 1-y;
00037                 y = a*y*y + b*y;
00038                 y = 1-y;
00039             } else {
00040                 y = a*y*y + b*y;
00041             }
00042             // Convert to 8 bit and save
00043             y = std::floor(y * 255 + 0.5f);
00044             if (y < 0) y = 0;
00045             if (y > 255) y = 255;
00046             lut[i] = (unsigned char)y;
00047         }
00048         
00049         // add a guard band
00050         for (int i = maxRaw+1; i < 4096; i++) {
00051             lut[i] = 255;
00052         }
00053     }
00054 
00055     Image demosaic(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00056         if (!src.image().valid()) {
00057             std::cerr << "Cannot demosaic an invalid image" << std::endl;
00058             Image nullImage;
00059             return nullImage;
00060         }
00061         if (src.image().bytesPerRow() % 2 == 1) {
00062             std::cerr << "Cannot demosaic an image with bytesPerRow not divisible by 2" << std::endl;
00063             Image nullImage;
00064             return nullImage;
00065         }
00066 
00067         #ifdef FCAM_ARCH_ARM
00068 
00069         /* The demosiac algorithm:  
00070            
00071         1: Load a block of input, treat it as 4-channel gr, r, b, gb
00072 
00073         1.5: Suppress hot pixels
00074         gr[HERE] = min(gr[HERE], max(gr[UP], gr[LEFT], gr[RIGHT], gr[DOWN]));
00075         r[HERE]  = min(r[HERE], max(r[UP], r[LEFT], r[RIGHT], r[DOWN]));
00076         b[HERE]  = min(b[HERE], max(b[UP], b[LEFT], b[RIGHT], b[DOWN]));
00077         gb[HERE] = min(gb[HERE], max(gb[UP], gb[LEFT], gb[RIGHT], gb[DOWN]));
00078         
00079         2: Interpolate g at r 
00080 
00081         gv_r = (gb[UP] + gb[HERE])/2;
00082         gvd_r = (gb[UP] - gb[HERE]);
00083 
00084         gh_r = (gr[HERE] + gr[RIGHT])/2;
00085         ghd_r = (gr[HERE] - gr[RIGHT]);
00086 
00087         g_r = ghd_r < gvd_r ? gh_r : gv_r;
00088         
00089         3: Interpolate g at b
00090 
00091         gv_b = (gr[DOWN] + gr[HERE])/2;
00092         gvd_b = (gr[DOWN] - gr[HERE]);
00093 
00094         gh_b = (gb[LEFT] + gb[HERE])/2;
00095         ghd_b = (gb[LEFT] - gb[HERE]);
00096 
00097         g_b = ghd_b < gvd_b ? gh_b : gv_b;
00098 
00099         4: Interpolate r at gr
00100 
00101         r_gr = (r[LEFT] + r[HERE])/2 + gr[HERE] - (g_r[LEFT] + g_r[HERE])/2;
00102 
00103         5: Interpolate b at gr
00104 
00105         b_gr = (b[UP] + b[HERE])/2 + gr[HERE] - (g_b[UP] + g_b[HERE])/2;
00106         
00107         6: Interpolate r at gb
00108         
00109         r_gb = (r[HERE] + r[DOWN])/2 + gb[HERE] - (g_r[HERE] + g_r[DOWN])/2;
00110 
00111         7: Interpolate b at gb
00112         
00113         b_gb = (b[HERE] + b[RIGHT])/2 + gb[HERE] - (g_b[HERE] + g_b[RIGHT])/2;
00114 
00115         8: Interpolate r at b
00116         
00117         rp_b = (r[DOWNLEFT] + r[HERE])/2 + g_b[HERE] - (g_r[DOWNLEFT] + g_r[HERE])/2;
00118         rn_b = (r[LEFT] + r[DOWN])/2 + g_b[HERE] - (g_r[LEFT] + g_r[DOWN])/2;
00119         rpd_b = (r[DOWNLEFT] - r[HERE]);
00120         rnd_b = (r[LEFT] - r[DOWN]);    
00121 
00122         r_b = rpd_b < rnd_b ? rp_b : rn_b;
00123 
00124         9: Interpolate b at r
00125         
00126         bp_r = (b[UPRIGHT] + b[HERE])/2 + g_r[HERE] - (g_b[UPRIGHT] + g_b[HERE])/2;
00127         bn_r = (b[RIGHT] + b[UP])/2 + g_r[HERE] - (g_b[RIGHT] + g_b[UP])/2;     
00128         bpd_r = (b[UPRIGHT] - b[HERE]);
00129         bnd_r = (b[RIGHT] - b[UP]);     
00130 
00131         b_r = bpd_r < bnd_r ? bp_r : bn_r;             
00132 
00133         10: Color matrix
00134 
00135         11: Gamma correct
00136            
00137         */
00138 
00139         #define BLOCK_WIDTH 40
00140         #define BLOCK_HEIGHT 24
00141 
00142         Image input = src.image();
00143 
00144         // Check we're the right bayer pattern. If not crop and continue.
00145         switch((int)src.bayerPattern()) {
00146         case GRBG:
00147             break;
00148         case RGGB:
00149             input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00150             break;
00151         case BGGR:
00152             input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00153             break;
00154         case GBRG:
00155             input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00156         default:
00157             printf("Can't demosaic from a non-bayer sensor\n");
00158         }       
00159 
00160         int rawWidth = input.width();
00161         int rawHeight = input.height();
00162 
00163         #define VEC_WIDTH ((BLOCK_WIDTH + 8)/8)
00164         #define VEC_HEIGHT ((BLOCK_HEIGHT + 8)/2)       
00165 
00166         int rawPixelsPerRow = input.bytesPerRow()/2 ; // Assumes bytesPerRow is even
00167 
00168         int outWidth = rawWidth-8;
00169         int outHeight = rawHeight-8;
00170         outWidth /= BLOCK_WIDTH;
00171         outWidth *= BLOCK_WIDTH;
00172         outHeight /= BLOCK_HEIGHT;
00173         outHeight *= BLOCK_HEIGHT;
00174 
00175         Image out(outWidth, outHeight, RGB24);
00176                 
00177         // Check we're the right size, if not, crop center
00178         if (((input.width() - 8) != (unsigned)outWidth) ||
00179             ((input.height() - 8) != (unsigned)outHeight)) { 
00180             int offX = (input.width() - 8 - outWidth)/2;
00181             int offY = (input.height() - 8 - outHeight)/2;
00182             offX -= offX&1;
00183             offY -= offY&1;
00184             
00185             if (offX || offY) {
00186                 input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00187             }
00188         }           
00189         
00190         Time startTime = Time::now(); 
00191 
00192         // Prepare the color matrix in S8.8 fixed point
00193         float colorMatrix_f[12];
00194         
00195         src.rawToRGBColorMatrix((float *)colorMatrix_f);
00196 
00197         int16x4_t colorMatrix[3];
00198         for (int i = 0; i < 3; i++) {
00199             int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00200             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00201             val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00202             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00203             val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00204             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00205             val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00206             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00207         }
00208 
00209         // A buffer to store data after demosiac and color correction
00210         // but before gamma correction
00211         uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00212 
00213         // Various color channels. Only 4 of them are defined before
00214         // demosaic, all of them are defined after demosiac
00215         int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00216 
00217         #define R_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*0)
00218         #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00219         #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00220         #define R_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*3)
00221 
00222         #define G_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*4)
00223         #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00224         #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00225         #define G_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*7)
00226 
00227         #define B_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*8)
00228         #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00229         #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00230         #define B_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*11)
00231 
00232         #define R_R(i)  (scratch+(i)+R_R_OFF)
00233         #define R_GR(i) (scratch+(i)+R_GR_OFF)
00234         #define R_GB(i) (scratch+(i)+R_GB_OFF)
00235         #define R_B(i)  (scratch+(i)+R_B_OFF)
00236 
00237         #define G_R(i)  (scratch+(i)+G_R_OFF)
00238         #define G_GR(i) (scratch+(i)+G_GR_OFF)
00239         #define G_GB(i) (scratch+(i)+G_GB_OFF)
00240         #define G_B(i)  (scratch+(i)+G_B_OFF)
00241 
00242         #define B_R(i)  (scratch+(i)+B_R_OFF)
00243         #define B_GR(i) (scratch+(i)+B_GR_OFF)
00244         #define B_GB(i) (scratch+(i)+B_GB_OFF)
00245         #define B_B(i)  (scratch+(i)+B_B_OFF)
00246 
00247         // Reuse some of the output scratch area for the noisy inputs
00248         #define G_GR_NOISY B_GR
00249         #define B_B_NOISY  G_B
00250         #define R_R_NOISY  G_R
00251         #define G_GB_NOISY B_GB
00252 
00253         // Prepare the lookup table
00254         unsigned char lut[4096];
00255         makeLUT(src, contrast, blackLevel, gamma, lut);
00256 
00257         // For each block in the input
00258         for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00259             const short * __restrict__ blockPtr = (const short *)input(0,by);
00260             unsigned char * __restrict__ outBlockPtr = out(0, by);
00261             for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {                
00262 
00263 
00264                 // Stage 1) Demux a block of input into L1
00265                 if (1) {
00266                     register const int16_t * __restrict__ rawPtr = blockPtr;
00267                     register const int16_t * __restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00268 
00269                     register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00270 
00271                     register int16_t * __restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00272                     register int16_t * __restrict__ r_r_ptr  = denoise ? R_R_NOISY(0)  : R_R(0);
00273                     register int16_t * __restrict__ b_b_ptr  = denoise ? B_B_NOISY(0)  : B_B(0);
00274                     register int16_t * __restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00275 
00276                     for (int y = 0; y < VEC_HEIGHT; y++) {
00277                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00278 
00279                             asm volatile ("# Stage 1) Demux\n");
00280                             
00281                             // The below needs to be volatile, but
00282                             // it's not clear why (if it's not, it
00283                             // gets optimized out entirely)
00284                             asm volatile (
00285                                  "vld2.16  {d6-d9}, [%[rawPtr]]! \n\t"
00286                                  "vld2.16  {d10-d13}, [%[rawPtr2]]! \n\t"
00287                                  "vst1.16  {d6-d7}, [%[g_gr_ptr]]! \n\t"
00288                                  "vst1.16  {d8-d9}, [%[r_r_ptr]]! \n\t"
00289                                  "vst1.16  {d10-d11}, [%[b_b_ptr]]! \n\t"
00290                                  "vst1.16  {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00291                                  [rawPtr]"+r"(rawPtr), 
00292                                  [rawPtr2]"+r"(rawPtr2),
00293                                  [g_gr_ptr]"+r"(g_gr_ptr),
00294                                  [r_r_ptr]"+r"(r_r_ptr),
00295                                  [b_b_ptr]"+r"(b_b_ptr),
00296                                  [g_gb_ptr]"+r"(g_gb_ptr) ::
00297                                  "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00298                             
00299                         }
00300 
00301                         rawPtr += rawJump;
00302                         rawPtr2 += rawJump;
00303                     }               
00304                 }
00305 
00306                 // Stage 1.5) Denoise sensor input (noisy pixel supression)
00307 
00308                 // A pixel can't be brighter than its brightest neighbor
00309 
00310                 if (denoise) {
00311                     register int16_t * __restrict__ ptr_in = NULL;
00312                     register int16_t * __restrict__ ptr_out = NULL;
00313                     asm volatile("#Stage 1.5: Denoise\n\t");
00314                     for (int b=0; b<4; b++) {
00315                         if (b==0) ptr_in = G_GR_NOISY(0);
00316                         if (b==1) ptr_in = R_R_NOISY(0);
00317                         if (b==2) ptr_in = B_B_NOISY(0);
00318                         if (b==3) ptr_in = G_GB_NOISY(0);
00319                         if (b==0) ptr_out = G_GR(0);
00320                         if (b==1) ptr_out = R_R(0);
00321                         if (b==2) ptr_out = B_B(0);
00322                         if (b==3) ptr_out = G_GB(0);
00323 
00324                         // write the top block pixels who aren't being denoised
00325                         for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00326                             int16x8_t in = vld1q_s16(ptr_in);
00327                             vst1q_s16(ptr_out, in);
00328                             ptr_in+=8;
00329                             ptr_out+=8;
00330                         }
00331 
00332                         for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00333                             for (int x = 0; x < VEC_WIDTH/2; x++) {
00334                                 int16x8_t here  = vld1q_s16(ptr_in);
00335                                 int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00336                                 int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00337                                 int16x8_t right = vld1q_s16(ptr_in + 1);
00338                                 int16x8_t left  = vld1q_s16(ptr_in - 1);
00339                                 int16x8_t max, min;
00340 
00341                                 // find the max and min of the neighbors
00342                                 max = vmaxq_s16(left, right);
00343                                 max = vmaxq_s16(above, max);
00344                                 max = vmaxq_s16(under, max);
00345 
00346                                 min = vminq_s16(left, right);
00347                                 min = vminq_s16(above, min);
00348                                 min = vminq_s16(under, min);                               
00349 
00350                                 // clamp here to be within this range
00351                                 here  = vminq_s16(max, here);
00352                                 here  = vmaxq_s16(min, here);
00353 
00354                                 vst1q_s16(ptr_out, here);
00355                                 ptr_in += 8;
00356                                 ptr_out += 8;
00357                             }
00358                         }
00359 
00360                         // write the bottom block pixels who aren't being denoised
00361                         for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00362                             int16x8_t in = vld1q_s16(ptr_in);
00363                             vst1q_s16(ptr_out, in);
00364                             ptr_in+=8;
00365                             ptr_out+=8;
00366                         }
00367                     }
00368                 }
00369 
00370                 // Stage 2 and 3) Do horizontal and vertical
00371                 // interpolation of green, as well as picking the
00372                 // output for green
00373                 /*
00374                   gv_r = (gb[UP] + gb[HERE])/2;
00375                   gvd_r = (gb[UP] - gb[HERE]);
00376                   gh_r = (gr[HERE] + gr[RIGHT])/2;
00377                   ghd_r = (gr[HERE] - gr[RIGHT]);                 
00378                   g_r = ghd_r < gvd_r ? gh_r : gv_r;
00379                   
00380                   gv_b = (gr[DOWN] + gr[HERE])/2;
00381                   gvd_b = (gr[DOWN] - gr[HERE]);                  
00382                   gh_b = (gb[LEFT] + gb[HERE])/2;
00383                   ghd_b = (gb[LEFT] - gb[HERE]);                  
00384                   g_b = ghd_b < gvd_b ? gh_b : gv_b;
00385                 */
00386                 if (1) {
00387                 
00388                     int i = VEC_WIDTH*4;
00389 
00390                     register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00391                     register int16_t *g_gb_here_ptr = G_GB(i);
00392                     register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00393                     register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00394                     register int16_t *g_gr_here_ptr = G_GR(i);
00395                     register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00396                     register int16_t *g_r_ptr = G_R(i);
00397                     register int16_t *g_b_ptr = G_B(i);
00398             
00399                     for (int y = 1; y < VEC_HEIGHT-1; y++) {
00400                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00401 
00402                             asm volatile ("#Stage 2) Green interpolation\n");
00403 
00404                             // Load the inputs
00405 
00406                             int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00407                             g_gb_up_ptr+=8;
00408                             int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00409                             g_gb_here_ptr+=8;
00410                             int16x8_t gb_left = vld1q_s16(g_gb_left_ptr); // unaligned
00411                             g_gb_left_ptr+=8;
00412                             int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00413                             g_gr_down_ptr+=8;
00414                             int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00415                             g_gr_here_ptr+=8;
00416                             int16x8_t gr_right = vld1q_s16(g_gr_right_ptr); // unaligned
00417                             g_gr_right_ptr+=8;
00418                             
00419                             //I couldn't get this assembly to work, and I don't know which
00420                             // of the three blocks of assembly is wrong
00421                             // This asm should load the inputs
00422                             /*
00423                             asm volatile(
00424                             "vld1.16        {d16-d17}, [%[gb_up]]!\n\t"
00425                             "vld1.16        {d18-d19}, [%[gb_here]]!\n\t"
00426                             "vld1.16        {d20-d21}, [%[gb_left]]!\n\t"
00427                             "vld1.16        {d22-d23}, [%[gr_down]]!\n\t"
00428                             "vld1.16        {d24-d25}, [%[gr_here]]!\n\t"
00429                             "vld1.16        {d26-d27}, [%[gr_right]]!\n\t"
00430                             :
00431                             [gb_up]"+r"(g_gb_up_ptr),
00432                             [gb_here]"+r"(g_gb_here_ptr),
00433                             [gb_left]"+r"(g_gb_left_ptr),
00434                             [gr_down]"+r"(g_gr_down_ptr),
00435                             [gr_here]"+r"(g_gr_here_ptr),
00436                             [gr_right]"+r"(g_gr_right_ptr) :: 
00437                             //"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
00438                             "q8","q9","q10","q11","q12","q13");
00439 
00440                             //q8 - gb_up
00441                             //q9 - gb_here
00442                             //q10 - gb_left
00443                             //q11 - gr_down
00444                             //q12 - gr_here
00445                             //q13 - gr_right
00446                             */
00447 
00448                             // Do the processing
00449                             int16x8_t gv_r  = vhaddq_s16(gb_up, gb_here);
00450                             int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00451                             int16x8_t gh_r  = vhaddq_s16(gr_right, gr_here);
00452                             int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00453                             int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00454 
00455                             int16x8_t gv_b  = vhaddq_s16(gr_down, gr_here);
00456                             int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00457                             int16x8_t gh_b  = vhaddq_s16(gb_left, gb_here);
00458                             int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00459                             int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00460                             
00461                             //this asm should do the above selection/interpolation
00462                             /*
00463                             asm volatile(
00464                             "vabd.s16       q0, q12, q13\n\t" //ghd_r
00465                             "vabd.s16       q1, q8, q9\n\t" //gvd_r
00466                             "vabd.s16       q2, q10, q9\n\t" //ghd_b
00467                             "vabd.s16       q3, q11, q12\n\t" //gvd_b
00468                             "vcgt.s16       q1, q0, q1\n\t" //select ghd_r or gvd_r
00469                             "vcgt.s16       q2, q2, q3\n\t" //select gvd_b or ghd_b
00470                             "vhadd.s16      q8, q8, q9\n\t" //gv_r
00471                             "vhadd.s16      q11, q11, q12\n\t" //gv_b
00472                             "vhadd.s16      q12, q12, q13\n\t" //gh_r
00473                             "vhadd.s16      q9, q9, q10\n\t" //gh_b
00474                             "vbsl           q1, q12, q8\n\t" //g_r
00475                             "vbsl           q2, q9, q11\n\t" //g_b
00476                              ::: "q0","q1","q2","q3","q8","q9","q10","q11","q12","q13");
00477                             */
00478 
00479                             //this should save the output
00480                             /*
00481                             asm volatile(
00482                             "vst1.16        {d2-d3}, [%[g_r]]!\n\t"
00483                             "vst1.16        {d4-d5}, [%[g_b]]!\n\t" :
00484                             [g_r]"+r"(g_r_ptr),[g_b]"+r"(g_b_ptr)
00485                             :: "memory");
00486                             */
00487                             
00488                             // Save the outputs
00489                             vst1q_s16(g_r_ptr, g_r);
00490                             g_r_ptr+=8;
00491                             vst1q_s16(g_b_ptr, g_b);
00492                             g_b_ptr+=8;
00493                         }
00494                     }
00495                 }
00496                 asm volatile ("#End of stage 2 (green interpolation)\n");
00497                 // Stages 4-9
00498 
00499                 if (1) {
00500                     
00501                     /*
00502                       r_gr = (r[LEFT] + r[HERE])/2 + gr[HERE] - (g_r[LEFT] + g_r[HERE])/2;
00503                       b_gr = (b[UP] + b[HERE])/2 + gr[HERE] - (g_b[UP] + g_b[HERE])/2;
00504                       r_gb = (r[HERE] + r[DOWN])/2 + gb[HERE] - (g_r[HERE] + g_r[DOWN])/2;
00505                       b_gb = (b[HERE] + b[RIGHT])/2 + gb[HERE] - (g_b[HERE] + g_b[RIGHT])/2;
00506                       
00507                       rp_b = (r[DOWNLEFT] + r[HERE])/2 + g_b[HERE] - (g_r[DOWNLEFT] + g_r[HERE])/2;
00508                       rn_b = (r[LEFT] + r[DOWN])/2 + g_b[HERE] - (g_r[LEFT] + g_r[DOWN])/2;
00509                       rpd_b = (r[DOWNLEFT] - r[HERE]);
00510                       rnd_b = (r[LEFT] - r[DOWN]);      
00511                       r_b = rpd_b < rnd_b ? rp_b : rn_b;                      
00512         
00513                       bp_r = (b[UPRIGHT] + b[HERE])/2 + g_r[HERE] - (g_b[UPRIGHT] + g_b[HERE])/2;
00514                       bn_r = (b[RIGHT] + b[UP])/2 + g_r[HERE] - (g_b[RIGHT] + g_b[UP])/2;       
00515                       bpd_r = (b[UPRIGHT] - b[HERE]);
00516                       bnd_r = (b[RIGHT] - b[UP]);       
00517                       b_r = bpd_r < bnd_r ? bp_r : bn_r;
00518                     */
00519 
00520                     int i = 2*VEC_WIDTH*4;
00521 
00522                     for (int y = 2; y < VEC_HEIGHT-2; y++) {
00523                         for (int x = 0; x < VEC_WIDTH; x++) {
00524 
00525                             asm volatile ("#Stage 4) r/b interpolation\n");
00526 
00527                             // Load the inputs
00528                             int16x4_t r_here       = vld1_s16(R_R(i));
00529                             int16x4_t r_left       = vld1_s16(R_R(i) - 1);
00530                             int16x4_t r_down       = vld1_s16(R_R(i) + VEC_WIDTH*4);
00531 
00532                             int16x4_t g_r_left     = vld1_s16(G_R(i) - 1);
00533                             int16x4_t g_r_here     = vld1_s16(G_R(i));
00534                             int16x4_t g_r_down     = vld1_s16(G_R(i) + VEC_WIDTH*4);
00535 
00536                             int16x4_t b_up         = vld1_s16(B_B(i) - VEC_WIDTH*4);
00537                             int16x4_t b_here       = vld1_s16(B_B(i));
00538                             int16x4_t b_right      = vld1_s16(B_B(i) + 1);
00539 
00540                             int16x4_t g_b_up       = vld1_s16(G_B(i) - VEC_WIDTH*4);
00541                             int16x4_t g_b_here     = vld1_s16(G_B(i));
00542                             int16x4_t g_b_right    = vld1_s16(G_B(i) + 1);
00543 
00544                             // Do the processing
00545                             int16x4_t gr_here      = vld1_s16(G_GR(i));
00546                             int16x4_t gb_here      = vld1_s16(G_GB(i));
00547 
00548                             { // red at green
00549                                 int16x4_t r_gr  = vadd_s16(vhadd_s16(r_left, r_here),
00550                                                             vsub_s16(gr_here,
00551                                                                       vhadd_s16(g_r_left, g_r_here)));
00552                                 int16x4_t r_gb  = vadd_s16(vhadd_s16(r_here, r_down),
00553                                                             vsub_s16(gb_here, 
00554                                                                       vhadd_s16(g_r_down, g_r_here)));
00555                                 vst1_s16(R_GR(i), r_gr);
00556                                 vst1_s16(R_GB(i), r_gb);
00557                             }
00558                             
00559                             { // red at blue
00560                                 int16x4_t r_downleft   = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00561                                 int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00562                                 
00563                                 int16x4_t rp_b  = vadd_s16(vhadd_s16(r_downleft, r_here),
00564                                                             vsub_s16(g_b_here,
00565                                                                      vhadd_s16(g_r_downleft, g_r_here)));
00566                                 int16x4_t rn_b  = vadd_s16(vhadd_s16(r_left, r_down),
00567                                                             vsub_s16(g_b_here,
00568                                                                      vhadd_s16(g_r_left, g_r_down)));
00569                                 int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00570                                 int16x4_t rnd_b = vabd_s16(r_left, r_down);
00571                                 int16x4_t r_b   = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00572                                 vst1_s16(R_B(i), r_b);
00573                             }
00574                             
00575                             { // blue at green
00576                                 int16x4_t b_gr  = vadd_s16(vhadd_s16(b_up, b_here),
00577                                                             vsub_s16(gr_here,
00578                                                                      vhadd_s16(g_b_up, g_b_here)));
00579                                 int16x4_t b_gb  = vadd_s16(vhadd_s16(b_here, b_right),
00580                                                             vsub_s16(gb_here,
00581                                                                      vhadd_s16(g_b_right, g_b_here)));
00582                                 vst1_s16(B_GR(i), b_gr);
00583                                 vst1_s16(B_GB(i), b_gb);
00584                             }
00585                             
00586                             { // blue at red
00587                                 int16x4_t b_upright    = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00588                                 int16x4_t g_b_upright  = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00589                                 
00590                                 int16x4_t bp_r  = vadd_s16(vhadd_s16(b_upright, b_here),
00591                                                             vsub_s16(g_r_here, 
00592                                                                       vhadd_s16(g_b_upright, g_b_here)));
00593                                 int16x4_t bn_r  = vadd_s16(vhadd_s16(b_right, b_up),
00594                                                             vsub_s16(g_r_here,
00595                                                                       vhadd_s16(g_b_right, g_b_up)));
00596                                 int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00597                                 int16x4_t bnd_r = vabd_s16(b_right, b_up);
00598                                 int16x4_t b_r   = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00599                                 vst1_s16(B_R(i), b_r);
00600                             }
00601                             
00602                             // Advance the index
00603                             i += 4;
00604                         }
00605                     }
00606                     asm volatile ("#End of stage 4 - what_ever\n\t");
00607                 }
00608 
00609                 // Stage 10)
00610                 if (1) {
00611                     // Color-correct and save the results into a 16-bit buffer for gamma correction
00612 
00613                     asm volatile ("#Stage 10) Color Correction\n");
00614 
00615                     uint16_t * __restrict__ out16Ptr = out16;
00616 
00617                     int i = 2*VEC_WIDTH*4;
00618 
00619                     for (int y = 2; y < VEC_HEIGHT-2; y++) {
00620 
00621                         // skip the first vec in each row
00622                         
00623                         int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00624                         int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00625                         int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00626                         i += 4;
00627 
00628                         for (int x = 1; x < VEC_WIDTH; x++) {
00629                             
00630                             int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00631                             int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00632                             int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00633                             
00634                             // do the color matrix
00635                             int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                       
00636                             rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00637                             rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00638                             rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00639                             
00640                             int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                       
00641                             gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00642                             gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00643                             gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00644                             
00645                             int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00646                             bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00647                             bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00648                             bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00649                             
00650                             uint16x4x3_t col16;
00651                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00652                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00653                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00654                             vst3_u16(out16Ptr, col16);
00655                             out16Ptr += 12;
00656                             
00657                             rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                 
00658                             rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00659                             rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00660                             rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00661                             
00662                             gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                 
00663                             gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00664                             gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00665                             gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00666                             
00667                             bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00668                             bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00669                             bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00670                             bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00671                             
00672                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00673                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00674                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00675                             vst3_u16(out16Ptr, col16);
00676                             out16Ptr += 12;
00677                             
00678                             r0 = r1;
00679                             g0 = g1;
00680                             b0 = b1;
00681 
00682                             i += 4;
00683                         }
00684 
00685                         // jump back
00686                         i -= VEC_WIDTH*4;
00687 
00688                         r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00689                         g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00690                         b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));                     
00691                         i += 4;
00692 
00693                         for (int x = 1; x < VEC_WIDTH; x++) {
00694                             int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00695                             int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00696                             int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00697                             
00698                             // do the color matrix
00699                             int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                       
00700                             rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00701                             rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00702                             rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00703                             
00704                             int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                       
00705                             gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00706                             gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00707                             gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00708                             
00709                             int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00710                             bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00711                             bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00712                             bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00713                             
00714                             uint16x4x3_t col16;
00715                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00716                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00717                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00718                             vst3_u16(out16Ptr, col16);
00719                             out16Ptr += 12;
00720                             
00721                             rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                 
00722                             rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00723                             rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00724                             rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00725                             
00726                             gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                 
00727                             gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00728                             gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00729                             gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00730                             
00731                             bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00732                             bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00733                             bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00734                             bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00735                             
00736                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00737                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00738                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00739                             vst3_u16(out16Ptr, col16);
00740                             out16Ptr += 12;
00741                             
00742                             r0 = r1;
00743                             g0 = g1;
00744                             b0 = b1;
00745 
00746                             i += 4;
00747                         }
00748                     }   
00749                     asm volatile("#End of stage 10) - color correction\n\t");
00750                 }
00751                 
00752 
00753                 if (1) {
00754 
00755                     asm volatile("#Gamma Correction\n");                   
00756                     // Gamma correction (on the CPU, not the NEON)
00757                     const uint16_t * __restrict__ out16Ptr = out16;
00758                     
00759                     for (int y = 0; y < BLOCK_HEIGHT; y++) {                    
00760                         unsigned int * __restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00761                         for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00762                             unsigned val = ((lut[out16Ptr[0]] << 0) |
00763                                             (lut[out16Ptr[1]] << 8) | 
00764                                             (lut[out16Ptr[2]] << 16) |
00765                                             (lut[out16Ptr[3]] << 24));
00766                             *outPtr32++ = val;
00767                             out16Ptr += 4;
00768                             // *outPtr++ = lut[*out16Ptr++];
00769                         }
00770                     }           
00771                     asm volatile("#end of Gamma Correction\n");                   
00772                     
00773                     /*
00774                     const uint16_t * __restrict__ out16Ptr = out16;                 
00775                     for (int y = 0; y < BLOCK_HEIGHT; y++) {                    
00776                         unsigned char * __restrict__ outPtr = (outBlockPtr + y * outWidth * 3);
00777                         for (int x = 0; x < (BLOCK_WIDTH*3); x++) {
00778                             *outPtr++ = lut[*out16Ptr++];
00779                         }
00780                     }
00781                     */
00782                     
00783                 }
00784                 
00785 
00786                 blockPtr += BLOCK_WIDTH;
00787                 outBlockPtr += BLOCK_WIDTH*3;
00788             }
00789         }       
00790 
00791         //std::cout << "Done demosaicking. time = " << ((Time::now() - startTime)/1000) << std::endl;
00792     #else
00793     Image out;
00794         #endif
00795     
00796         return out;
00797     }
00798 
00799     #ifdef __arm__
00800     Image makeThumbnailRAW_N900(Frame src, float contrast, int blackLevel, float gamma) {
00801         // Assuming we want a slightly-cropped thumbnail into 640x480, Bayer pattern GRBG
00802         // This means averaging together a 4x4 block of Bayer pattern for one RGB24 pixel
00803         // Also want to convert to sRGB, which includes a color matrix multiply and a gamma transform
00804         // using a lookup table.
00805 
00806         // Implementation: 
00807         //   Uses ARM NEON SIMD vector instructions and inline assembly.
00808         //   Reads in a 16x4 block of pixels at a time, in 16-bit GRBG Bayer format, and outputs a 4x1 block of RGB24 pixels.
00809         // Important note: Due to some apparent bugs in GCC's inline assembly register mapping between C variables and NEON registers,
00810         //   namely that trying to reference an int16x4 variable creates a reference to a s register instead of a d register, all the
00811         //   int16x4 variables are forced into specific NEON registers, and then referred to using that register, not by name.  
00812         //   This bug seems to be in gcc 4.2.1, should be fixed by 4.4 based on some gcc bug reports.
00813 
00814         Image thumb(640, 480, RGB24);
00815         const unsigned int w = 2592, tw = 640;
00816         const unsigned int h = 1968, th = 480;
00817         const unsigned int scale = 4;
00818         const unsigned int cw = tw*scale;
00819         const unsigned int ch = th*scale;
00820         const unsigned int startX = (w-cw)/2;
00821         const unsigned int startY = (h-ch)/2;        
00822         const unsigned int bytesPerRow = src.image().bytesPerRow();
00823 
00824         // Make the response curve
00825         unsigned char lut[4096];
00826         makeLUT(src, contrast, blackLevel, gamma, lut);
00827 
00828         unsigned char *row = src.image()(startX, startY);
00829 
00830         Time startTime = Time::now();
00831         float colorMatrix_f[12];
00832         src.rawToRGBColorMatrix(colorMatrix_f);
00833 
00834         register int16x4_t colorMatrix0 asm ("d0"); // ASM assignments are essential - they're implicitly trusted by the inline code.
00835         register int16x4_t colorMatrix1 asm ("d1");
00836         register int16x4_t colorMatrix2 asm ("d2");
00837         register int16x4_t wCoord asm ("d20"); // Workaround for annoyances with scalar addition.
00838         register int16x4_t maxValue asm ("d21"); // Maximum allowed signed 16-bit value
00839         register int16x4_t minValue asm ("d22"); // Minimum allowed signed 16-bit value
00840 
00841         asm volatile(
00842                     // Load matrix into colorMatrix0-2, set to be d0-d2
00843                     "vldm %[colorMatrix_f], {q2,q3,q4}  \n\t"
00844                     "vcvt.s32.f32 q2, q2, #8  \n\t" // Float->fixed-point conversion
00845                     "vcvt.s32.f32 q3, q3, #8  \n\t"
00846                     "vcvt.s32.f32 q4, q4, #8  \n\t"
00847                     "vmovn.i32 d0, q2  \n\t" // Narrowing to 16-bit
00848                     "vmovn.i32 d1, q3  \n\t"
00849                     "vmovn.i32 d2, q4  \n\t"
00850                     // Load homogenous coordinate, pixel value limits
00851                     "vmov.i16  d20, #0x4   \n\t"  // Homogenous coordinate. 
00852                     "vmov.i16  d21, #0x00FF  \n\t"  // Maximum pixel value: 1023
00853                     "vorr.i16  d21, #0x0300  \n\t"  // Maximum pixel value part 2
00854                     "vmov.i16  d22, #0x0     \n\t"  // Minimum pixel value: 0
00855                     : [colorMatrix0] "=w" (colorMatrix0),
00856                       [colorMatrix1] "=w" (colorMatrix1),
00857                       [colorMatrix2] "=w" (colorMatrix2),
00858                       [wCoord] "=w" (wCoord),
00859                       [maxValue] "=w" (maxValue),
00860                       [minValue] "=w" (minValue)
00861                     :  [colorMatrix_f] "r" (colorMatrix_f)
00862                     : "memory",
00863                       "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00864                 
00865         for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00866             register unsigned short *px0 = (unsigned short *)row;
00867             register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00868             register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00869             register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00870 
00871             register unsigned char *dst = thumb(0,ty);
00872             for (register unsigned int tx =0; tx < 640; tx+=scale) {
00873                 // Assembly block for fast downsample/demosaic, color correction, and gamma curve lookup
00874                 asm volatile(
00875                     // *px0: GRGR GRGR GRGR GRGR
00876                     // *px1: BGBG BGBG BGBG BGBG
00877                     // *px2: GRGR GRGR GRGR GRGR
00878                     // *px3: BGBG BGBG BGBG BGBG
00880                     "vld2.16 {d4-d7}, [%[px0]]!  \n\t"
00881                     "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00882                     "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00883                     "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00884                     //  d4    d5    d6    d7
00885                     // GG|GG GG|GG RR|RR RR|RR
00886                     //  d8    d9    d10   d11
00887                     // BB|BB BB|BB GG|GG GG|GG
00888                     //  d12   d13   d14   d15
00889                     // GG|GG GG|GG RR|RR RR|RR
00890                     //  d16   d17   d18   d19
00891                     // BB|BB BB|BB GG|GG GG|GG
00892 
00894                     "vpadd.u16 d4, d4, d5  \n\t"   // G1
00895                     "vpadd.u16 d5, d6, d7  \n\t"   // R1
00896                     "vpadd.u16 d6, d8, d9  \n\t"   // B1
00897                     "vpadd.u16 d7, d10, d11 \n\t"  // G2
00898                     "vpadd.u16 d8, d12, d13 \n\t"  // G3
00899                     "vpadd.u16 d9, d14, d15 \n\t"  // R2
00900                     "vpadd.u16 d10, d16, d17 \n\t" // B2
00901                     "vpadd.u16 d11, d18, d19 \n\t" // G4
00902                     //    d4       d5       d6       d7
00903                     // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00904                     //    d8       d9       d10      d11
00905                     // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00906 
00908                     "vadd.u16 d7, d8   \n\t"
00909                     "vadd.u16 d4, d11   \n\t"
00910                     "vhadd.u16 d4, d7  \n\t"
00912                     "vadd.u16 d5, d9  \n\t"
00914                     "vadd.u16 d6, d10 \n\t"
00915                     //    d4       d5       d6  
00916                     // G|G|G|G  R|R|R|R  B|B|B|B
00917                     //
00918                     // Assuming sRGB affine matrix stored in fixed precision (lsb = 1/256)
00919                     // Trusting GCC to properly assign colorMatrix0-2 to d0-d2.  Direct reference seems to be broken on g++ 4.2.1 at least.
00920                     // r   colorMatrix0[0] [1] [2] [3]   r_in
00921                     // g = colorMatrix1[0] [1] [2] [3] * g_in
00922                     // b   colorMatrix2[0] [1] [2] [3]   b_in
00923 
00925 
00926                     "vmull.s16 q4, d5, d0[0] \n\t"
00927                     "vmlal.s16 q4, d4, d0[1] \n\t"
00928                     "vmlal.s16 q4, d6, d0[2] \n\t"
00929                     "vmlal.s16 q4, d20, d0[3] \n\t" 
00930 
00931                     "vmull.s16 q5, d5, d1[0] \n\t"
00932                     "vmlal.s16 q5, d4, d1[1] \n\t"
00933                     "vmlal.s16 q5, d6, d1[2] \n\t"
00934                     "vmlal.s16 q5, d20, d1[3] \n\t"
00935 
00936                     "vmull.s16 q6, d5, d2[0] \n\t"
00937                     "vmlal.s16 q6, d4, d2[1] \n\t"
00938                     "vmlal.s16 q6, d6, d2[2] \n\t"
00939                     "vmlal.s16 q6, d20, d2[3] \n\t"
00940 
00941                     //  d08  d09  d10  d11  d12  d13
00942                     //  R|R  R|R  G|G  G|G  B|B  B|B
00943 
00945                     "vrshrn.s32 d3, q4, #10  \n\t"
00946                     "vrshrn.s32 d4, q5, #10  \n\t"
00947                     "vrshrn.s32 d5, q6, #10  \n\t"
00949                     "vmin.s16 d3, d3, d21    \n\t"
00950                     "vmin.s16 d4, d4, d21    \n\t"
00951                     "vmin.s16 d5, d5, d21    \n\t"
00952                     "vmax.s16 d3, d3, d22    \n\t"
00953                     "vmax.s16 d4, d4, d22    \n\t"
00954                     "vmax.s16 d5, d5, d22    \n\t"
00955 
00956                     //    d3       d4       d2
00957                     // R|R|R|R  G|G|G|G  B|B|B|B
00958                     
00960                     "vmov r0,r1, d3                        \n\t"
00961                     //    r0       r1
00962                     // R16|R16  R16|R16
00963                     "uxth r2, r0                           \n\t" // Extract first red pixel into r2
00964                     "ldrb r4, [%[gammaTable], r2]          \n\t" // Table lookup, byte result
00965 
00966                     "uxth r2, r0, ROR #16                  \n\t"
00967                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00968                     "orr  r4, r4, r3, LSL #24              \n\t"
00969 
00970                     "uxth r2, r1                           \n\t"
00971                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00972                     "mov  r5, r3, LSL #16                  \n\t"
00973 
00974                     "uxth r2, r1, ROR #16                  \n\t"
00975                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00976                     "mov  r6, r3, LSL #8                   \n\t"
00977 
00978                     //   r4   r5   r6  
00979                     //  R__R __R_ _R__  -> increasing mem address (and increasing left shift)
00980 
00981                     "vmov r0,r1, d4                        \n\t"
00982                     //    r0       r1
00983                     // G16|G16  G16|G16
00984                     "uxth r2, r0                           \n\t"
00985                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00986                     "orr  r4, r4, r3, LSL #8               \n\t"
00987 
00988                     "uxth r2, r0, ROR #16                  \n\t"
00989                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00990                     "orr  r5, r5, r3                       \n\t"
00991 
00992                     "uxth r2, r1                           \n\t"
00993                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00994                     "orr  r5, r5, r3, LSL #24              \n\t"
00995 
00996                     "uxth r2, r1, ROR #16                  \n\t"
00997                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00998                     "orr  r6, r6, r3, LSL #16              \n\t"
00999 
01000                     //   r4   r5   r6  
01001                     //  RG_R G_RG _RG_  -> increasing mem address (and increasing left shift)
01002 
01003                     "vmov r0,r1, d5                        \n\t"
01004                     //    r0       r1
01005                     // B16|B16  B16|B16
01006                     "uxth r2, r0                           \n\t"
01007                     "ldrb r3, [%[gammaTable], r2]          \n\t"
01008                     "orr  r4, r4, r3, LSL #16              \n\t"
01009 
01010                     "uxth r2, r0, ROR #16                  \n\t"
01011                     "ldrb r3, [%[gammaTable], r2]          \n\t"
01012                     "orr  r5, r5, r3, LSL #8               \n\t"
01013 
01014                     "uxth r2, r1                           \n\t"
01015                     "ldrb r3, [%[gammaTable], r2]          \n\t"
01016                     "orr  r6, r6, r3                       \n\t"
01017 
01018                     "uxth r2, r1, ROR #16                  \n\t"
01019                     "ldrb r3, [%[gammaTable], r2]          \n\t"
01020                     "orr  r6, r6, r3, LSL #24              \n\t"
01021 
01022                     //   r4   r5   r6  
01023                     //  RGBR GBRG BRGB 
01024 
01025                     "stm %[dst]!, {r4,r5,r6}                   \n\t" // multi-store!
01026                     : [px0] "+&r" (px0),
01027                       [px1] "+&r" (px1),
01028                       [px2] "+&r" (px2),
01029                       [px3] "+&r" (px3),
01030                       [dst] "+&r" (dst)
01031                     : [gammaTable] "r" (lut),
01032                       [colorMatrix0] "w" (colorMatrix0), // Implicitly referenced only (d0)
01033                       [colorMatrix1] "w" (colorMatrix1), // Implicitly referenced only (d1)
01034                       [colorMatrix2] "w" (colorMatrix2), // Implicitly referenced only (d2)
01035                       [wCoord] "w" (wCoord),             // Implicitly referenced only (d20)
01036                       [maxValue] "w" (maxValue),         // Implicitly referenced only (d21)
01037                       [minValue] "w" (minValue)          // Implicitly referenced only (d22)
01038                     : "memory", 
01039                       "r0", "r1", "r2", "r3", "r4", "r5", "r6",
01040                       "d3", "d4", "d5", "d6",
01041                       "d7", "d8", "d9", "d10", 
01042                       "d11", "d12", "d13", "d14",
01043                       "d15", "d16", "d17", "d18", "d19"
01044                     );
01045 
01046             }            
01047         }
01048 
01049         
01050         //std::cout << "Done creating fast thumbnail. time = " << ((Time::now()-startTime)/1000) << std::endl;
01051 
01052         return thumb;
01053     }
01054     #endif
01055 
01056     // Generic RAW to thumbnail converter, roughly 680 ms on 5MP->640x480, N900
01057     Image makeThumbnailRAW(Frame src, const Size &thumbSize, float contrast, int blackLevel, float gamma) {
01058         
01059         // Special-case the N900 common case for speed (5 MP GRGB -> 640x480 RGB24)
01060     #ifdef __arm__
01061         if (src.image().width() == 2592 &&
01062             src.image().height() == 1968 &&
01063             thumbSize.width == 640 &&
01064             thumbSize.height == 480 &&
01065             src.bayerPattern() == GRBG) {
01066             return makeThumbnailRAW_N900(src, contrast, blackLevel, gamma);
01067         }
01068         #endif
01069 
01070         // Make the response curve
01071         unsigned char lut[4096];
01072         makeLUT(src, contrast, blackLevel, gamma, lut);
01073 
01074         Image thumb;
01076         bool redRowEven, blueRowGreenPixelEven;
01077         switch (src.bayerPattern()) {
01078         case RGGB:
01079             redRowEven = true;
01080             blueRowGreenPixelEven = true;
01081             break;
01082         case BGGR:
01083             redRowEven = false;
01084             blueRowGreenPixelEven = false;
01085             break;
01086         case GRBG:
01087             redRowEven = true;
01088             blueRowGreenPixelEven = false;
01089             break;
01090         case GBRG:
01091             redRowEven = false;
01092             blueRowGreenPixelEven = true;
01093             break;
01094         default:
01095             return thumb;
01096             break;
01097         }
01098 
01099         Time startTime = Time::now();
01100 
01101         thumb = Image(thumbSize, RGB24);
01102 
01103         unsigned int w = src.image().width();
01104         unsigned int h = src.image().height();
01105         unsigned int tw = thumbSize.width;
01106         unsigned int th = thumbSize.height;
01107         short maxPixel = 1023;
01108         short minPixel = 0;
01109         unsigned int scaleX = (int)std::floor((float)w / tw);
01110         unsigned int scaleY = (int)std::floor((float)h / th);
01111         unsigned int scale = std::min(scaleX, scaleY); // Maintain aspect ratio
01112         
01113         int cropX = (w-scale*tw)/2;
01114         if (cropX % 2 == 1) cropX--; // Ensure we're at start of 2x2 block
01115         int cropY = (h-scale*th)/2;
01116         if (cropY % 2 == 1) cropY--; // Ensure we're at start of 2x2 block
01117 
01118         float colorMatrix[12];
01119         src.rawToRGBColorMatrix(colorMatrix);
01120 
01121         /* A fast downsampling/demosaicing - average down color
01122            channels over the whole source pixel block under a single
01123            thumbnail pixel, and just use those colors directly as the
01124            pixel colors. */        
01125 
01126         for (unsigned int ty=0; ty < th; ty++) {
01127             unsigned char *tpix = thumb(0,ty); // Get a pointer to the beginning of the row
01128             for (unsigned int tx=0; tx < tw; tx++) {
01129                 unsigned int r=0,g=0,b=0;
01130                 unsigned int rc=0,gc=0,bc=0;
01131 
01132                 unsigned char *rowStart = src.image()(cropX + tx*scale, cropY + ty*scale);
01133 
01134                 bool isRedRow = ((ty*scale % 2 == 0) == redRowEven);
01135 
01136                 for(unsigned int i=0; i<scale; i++, rowStart+=src.image().bytesPerRow(), isRedRow = !isRedRow) {                    
01137                     unsigned short *px = (unsigned short *)rowStart;
01138                     if (isRedRow) {
01139                         bool isRed=((tx*scale)%2==0) == blueRowGreenPixelEven;
01140                         for (unsigned int j=0; j < scale; j++, isRed=!isRed) {
01141                             if (isRed) {
01142                                 r += *(px++);
01143                                 rc++;
01144                             } else {
01145                                 g += *(px++);
01146                                 gc++;
01147                             }
01148                         }
01149                     } else {
01150                         bool isGreen=((tx*scale)%2==0) == blueRowGreenPixelEven;
01151                         for (unsigned int j=0; j < scale; j++, isGreen=!isGreen) {
01152                             if (isGreen) {
01153                                 g += *(px++);
01154                                 gc++;
01155                             } else {
01156                                 b += *(px++);
01157                                 bc++;
01158                             }
01159                             
01160                         }
01161                     }
01162                 }
01163                 float r_sensor = ((float)r / rc);
01164                 float g_sensor = ((float)g / gc);
01165                 float b_sensor = ((float)b / bc);
01166                 float r_srgb = (r_sensor * colorMatrix[0] +
01167                                 g_sensor * colorMatrix[1] +
01168                                 b_sensor * colorMatrix[2] +
01169                                 colorMatrix[3]);
01170                 float g_srgb = (r_sensor * colorMatrix[4] +
01171                                 g_sensor * colorMatrix[5] +
01172                                 b_sensor * colorMatrix[6] +
01173                                 colorMatrix[7]);
01174                 float b_srgb = (r_sensor * colorMatrix[8] +
01175                                 g_sensor * colorMatrix[9] +
01176                                 b_sensor * colorMatrix[10] +
01177                                 colorMatrix[11]);
01178                 unsigned short r_linear = std::min(maxPixel,std::max(minPixel,(short int)std::floor(r_srgb+0.5)));
01179                 unsigned short g_linear = std::min(maxPixel,std::max(minPixel,(short int)std::floor(g_srgb+0.5)));
01180                 unsigned short b_linear = std::min(maxPixel,std::max(minPixel,(short int)std::floor(b_srgb+0.5)));
01181                 *(tpix++) = lut[r_linear];
01182                 *(tpix++) = lut[g_linear];
01183                 *(tpix++) = lut[b_linear];
01184             }
01185         }
01186 
01187         //std::cout << "Done creating thumbnail. time = " << ((Time::now()-startTime)/1000) << std::endl;
01188 
01189         return thumb;
01190 
01191     }
01192 
01193     Image makeThumbnail(Frame src, const Size &thumbSize, float contrast, int blackLevel, float gamma) {
01194         Image thumb;
01195 
01196         // Sanity checks
01197         if (not src.image().valid()) return thumb;
01198         if (thumbSize.width == 0 or thumbSize.height == 0) return thumb;
01199 
01200         switch (src.image().type()) {
01201         case RAW:
01202             thumb = makeThumbnailRAW(src, thumbSize, contrast, blackLevel, gamma);
01203             break;
01204         case RGB24:
01205         case RGB16:
01206         case UYVY:
01207         case YUV24:
01208         case UNKNOWN:
01209         default:
01211             // These formats can't be thumbnailed yet
01212             break;
01213         }
01214         return thumb;
01215     }
01216 
01217 }