Reply by Richard Williams●December 3, 20102010-12-03
Sun,
I think the documentation on the use of _nassert() needs a re-read.
The _nassert() calls only need to be executed once,
at the beginning of the function, to assert the alignment of the passed-in
pointer parameters and the sizing of the width and height parameters.
Any and all other _nassert() calls are unneeded and just waste CPU cycles.
More than likely, the compiler will not have enough registers available for
the number of 'register' modifiers listed in this function.
Therefore, *I* would only place the 'register' modifier on those variables
that are most heavily used.
R. Williams
---------- Original Message -----------
From: "shengkai.sun"
To: "c6x"
Sent: Fri, 3 Dec 2010 16:58:40 +0800
Subject: [c6x] Re: Image Rotate Optimize
> Thanks for your reply, Williams!
>
> I have modified the code according to what you said, but the
> performance is still the same. I checked the asm code, and the
> bottleneck may lie in data fetching. Here is the modified code:
>
> #define ALIGNED_ARRAY8(ptr) _nassert((int)ptr % 8 == 0)
> #define ALIGNED_ARRAY4(ptr) _nassert((int)ptr % 4 == 0)
> int VixEye_format_yyuuyyvv2planaryuvRoate90(char *restrict pInBuf,
> char *restrict pOutBuf, const int nWidth, const int nHeight, int
> nRightLeft) { int i, j;
>
> //src pionters
> register char *restrict pSrcRow0 = pInBuf;
> register char *restrict pSrcRow1 = pInBuf;
> register char *restrict pSrcRow2 = pInBuf;
> register char *restrict pSrcRow3 = pInBuf;
> register char *restrict pSrcRow4 = pInBuf;
> register char *restrict pSrcRow5 = pInBuf;
> register char *restrict pSrcRow6 = pInBuf;
> register char *restrict pSrcRow7 = pInBuf;
>
> //dst pionters
> register char *restrict pY0;
> register char *restrict pU0;
> register char *restrict pV0;
> register unsigned int nOffset = nHeight;
> register unsigned int nPixel0, nPixel1, nPixel2, nPixel3, nPixel4,
> nPixel5, nPixel6, nPixel7; register unsigned int nTemp0, nTemp1;
> register int nYuvWidth = (nWidth << 1);
>
> pY0 = pOutBuf + nHeight - 4;
> pU0 = pOutBuf + nWidth * nHeight + (nHeight >> 1) - 4;
> pV0 = pOutBuf + nWidth * nHeight + (nWidth * nHeight >> 1) + (nHeight
> >> 1) - 4;
>
> ALIGNED_ARRAY8(pInBuf);
> ALIGNED_ARRAY8(pOutBuf);
> ALIGNED_ARRAY4(pSrcRow0);
> ALIGNED_ARRAY4(pSrcRow1);
> ALIGNED_ARRAY4(pSrcRow2);
> ALIGNED_ARRAY4(pSrcRow3);
> ALIGNED_ARRAY4(pY0);
> ALIGNED_ARRAY4(pU0);
> ALIGNED_ARRAY4(pV0);
> //#pragma MUST_ITERATE(AOI_WIDTH/4,FF_WIDTH/4,16); //AOI_WIDTH = 752,
> FF_WIDTH = 1600
> #pragma UNROLL(8);
> for(i = 0; i < nYuvWidth; i += 4)
> {
> //src points to the next 4 columns
> pSrcRow0 = pInBuf + i; //first row
> pSrcRow1 = pSrcRow0 + nYuvWidth; //second row;
> pSrcRow2 = pSrcRow1 + nYuvWidth; //3rd row;
> pSrcRow3 = pSrcRow2 + nYuvWidth; //4th row
> pSrcRow4 = pSrcRow3 + nYuvWidth; //5th row;
> pSrcRow5 = pSrcRow4 + nYuvWidth; //6th row;
> pSrcRow6 = pSrcRow5 + nYuvWidth; //7th row
> pSrcRow7 = pSrcRow6 + nYuvWidth; //8th row
> ALIGNED_ARRAY4(pSrcRow0);
> ALIGNED_ARRAY4(pSrcRow1);
> ALIGNED_ARRAY4(pSrcRow2);
> ALIGNED_ARRAY4(pSrcRow3);
> ALIGNED_ARRAY4(pSrcRow4);
> ALIGNED_ARRAY4(pSrcRow5);
> ALIGNED_ARRAY4(pSrcRow6);
> ALIGNED_ARRAY4(pSrcRow7);
> ALIGNED_ARRAY4(pY0);
> ALIGNED_ARRAY4(pU0);
> ALIGNED_ARRAY4(pV0);
> //#pragma MUST_ITERATE(AOI_HEIGHT/8,FF_HEIGHT/8,2); //AOI_HEIGHT = 480,
> FF_HEIGHT = 1200
> #pragma UNROLL(2);
> for(j = 0; j < nHeight; j += 8)
> {
> nPixel0 = _mem4(pSrcRow0); //u1u0y1y0
> nPixel1 = _mem4(pSrcRow1); //v1v0y3y2
> nOffset = nHeight; //length of Y after rotation
> nTemp0 = _pack2(nPixel0, nPixel1); //y1y0y3y2
>
> nPixel2 = _mem4(pSrcRow2); //u3u2y5y4
> nPixel3 = _mem4(pSrcRow3); //v3v2y7y6
> nTemp1 = _pack2(nPixel2, nPixel3); //y5y4y7y6
> _mem4(pY0) = _packl4(nTemp0, nTemp1); //y0y2y4y6, row1 after rotate,
> little endian _mem4(pY0 + nOffset) = _packh4(nTemp0, nTemp1);
> //y1y3y5y7, row2 after rotate, little endian
>
> pY0 -= 4;
> nPixel4 = _mem4(pSrcRow4); //u5u4y9y8
> nPixel5 = _mem4(pSrcRow5); //v5v4y11y10
> nTemp0 = _pack2(nPixel4, nPixel5); //y9y8y11y10
> nPixel6 = _mem4(pSrcRow6); //u7u6y13y12
> nPixel7 = _mem4(pSrcRow7); //v7v6y15y14
> nTemp1 = _pack2(nPixel6, nPixel7); //y13y12y15y14
> _mem4(pY0) = _packl4(nTemp0, nTemp1); //y8y10y12y14, row1 after rotate,
> little endian _mem4(pY0 + nOffset) = _packh4(nTemp0, nTemp1);
> //y9y11y13y15, row2 after rotate, little endian
>
> nOffset >>= 1; //divided by 2, because width of U, V is 1/2 of Y
> nTemp0 = _packh2(nPixel0, nPixel2); //u1u0u3u2
> nTemp1 = _packh2(nPixel4, nPixel6); //u5u4u7u6
> _mem4(pU0) = _packl4(nTemp0, nTemp1); //u0u2u4u6, row1 after rotate,
> little endian _mem4(pU0 + nOffset) = _packh4(nTemp0, nTemp1);
> //u1u3u5u7, row2 after rotate, little endian
>
> nTemp0 = _packh2(nPixel1, nPixel3); //v1v0v3v2
> nTemp1 = _packh2(nPixel5, nPixel7); //v5v4v7v6
> _mem4(pV0) = _packl4(nTemp0, nTemp1); //v0v2v4v6
> _mem4(pV0 + nOffset) = _packh4(nTemp0, nTemp1); //v1v3vv5v7
>
> nOffset = nYuvWidth << 3;
> pSrcRow0 += nOffset; //jump 8 rows
> pSrcRow1 += nOffset;
> pSrcRow2 += nOffset;
> pSrcRow3 += nOffset;
> pSrcRow5 += nOffset;
> pSrcRow6 += nOffset;
> pSrcRow7 += nOffset;
> pSrcRow4 += nOffset;
> pY0 -= 4; //go to previous column
> pU0 -= 4;
> pV0 -= 4;
> }
>
> //Dst points to next 2 rows, and the last column
> pY0 += 3 * nHeight;
> pU0 += (3 * nHeight >> 1);
> pV0 += (3 * nHeight >> 1);
> }
> }
>
> The following is the generated asm code.
> Is there anything I can do to improve the efficiency?
> Thanks!!
>
>
;**************************************************************************** ** > ;* FUNCTION NAME: yyuuyyvv2planaryuvRoate90
*
> ;*
> * ;* Regs Modified : A0,A1,A3,A4,A5,A6,A7,A8,A9,B0,B1,B2,
> B4,B5,B6,B7,B8, * ;* B9,A16,A17,A18,A19,A20,
> A21,A22,A23,A24,A25,A26, * ;* A27,A28,A29,
> A30,A31,B16,B17,B18,B19,B20,B21,B22, * ;*
> B23,B24,B25,B26,B27,B30,B31 * ;* Regs Used
> : A0,A1,A3,A4,A5,A6,A7,A8,A9,B0,B1,B2,B3,B4,B5,B6,B7, * ;*
> B8,B9,A16,A17,A18,A19,A20,A21,A22,A23,A24,A25, *
> ;* A26,A27,A28,A29,A30,A31,B16,B17,B18,B19,
> B20,B21, * ;* B22,B23,B24,B25,B26,B27,B30,
> B31 * ;* Local Frame Size : 0 Args + 0 Auto + 0
> Save = 0 byte *
;****************************************************************************
**
> .dwpsn file "YUVRotate.c",line 226,column
0,is_stmt
>
> 2010-12-03
>
> shengkai.sun ------- End of Original Message -------
_____________________________________
Reply by "shengkai.sun"●December 3, 20102010-12-03
Thanks for your reply, Williams!
I have modified the code according to what you said, but the performance is
still the same. I checked the asm code, and the bottleneck may lie in data fetching.
Here is the modified code:
#define ALIGNED_ARRAY8(ptr) _nassert((int)ptr % 8 == 0)
#define ALIGNED_ARRAY4(ptr) _nassert((int)ptr % 4 == 0)
int VixEye_format_yyuuyyvv2planaryuvRoate90(char *restrict pInBuf, char
*restrict pOutBuf, const int nWidth, const int nHeight, int nRightLeft)
{
int i, j;
nOffset >>= 1; //divided by 2, because width of U, V is 1/2 of Y
nTemp0 = _packh2(nPixel0, nPixel2); //u1u0u3u2
nTemp1 = _packh2(nPixel4, nPixel6); //u5u4u7u6
_mem4(pU0) = _packl4(nTemp0, nTemp1); //u0u2u4u6, row1 after rotate, little
endian
_mem4(pU0 + nOffset) = _packh4(nTemp0, nTemp1); //u1u3u5u7, row2 after rotate,
little endian
Reply by Richard Williams●December 1, 20102010-12-01
shengkai,
Adding a couple of '_nassert' statements at the beginning of the function to
assure the data is appropriately aligned in memory (the data should be aligned
on an 8-byte boundary).
Adding a couple of '_nassert' statements at the beginning of the function to
assure the data size is a multiple of 8.
Adding an 'UNROLL' pragma at the top of each loop.
Adding a 'restrict' modifier to the image pointers in the passed
parameters.
Adding a 'register' modifier to the local/automatic image pointer
definitions.
Your code seems to be assuming that the incoming image will contain a number
of rows that is a multiple of 4.
Adding a '_nassert' statement that assures this is a fact.
Adding the appropriate statements so the DSP internal loop buffer will be
used.
(This may require modifying the code to have 3 loops rather than just one.)
The FF_WIDTH and FF_HEIGHT values used in the 'MUST_ITERATE' pragma seem
unrelated to the actual image size. This will result in problems when the
image size is not a match for the FF_WIDTH and FF_HEIGHT values.
R. Williams
---------- Original Message -----------
From: s...@gmail.com
To: c...
Sent: Tue, 30 Nov 2010 21:34:28 -0500
Subject: [c6x] Image Rotate Optimize
> Hi, there!
>
> I am trying to rotate a YUV image 90 degrees clockwise, but the
> efficiency is not good enough. Please help me...
>
> Here is the input image data:
> 1st row:y00 y01 u00 u01 y02 y03 u02 u03 ....
> 2nd row:y10 y11 v10 v11 y12 y13 v12 v13 ....
> 3rd row:y20 y21 u20 u21 y22 y23 u22 u23 ....
> 4th row:y30 y31 v30 v31 y32 y33 v32 v33 ....
> ... ...
>
> u00 and y00 correspond to the first pixel, u01 and y01 correspond to
> the next pixel, and so on. Adjacent rows are stored contiguously
> in RAM.
>
> What I want to do is rotate the image clockwise by 90 degrees and
> separate Y, U, V for YUV422 format, so after rotation, the 1st column
> of the original image should be placed in the 1st row, and the 1st row of
> the original image should be placed in the last column. The data should
> look like this:
>
> Y:
> 1st row: ... ... y30 y20 y10 y00
> 2nd row: ... ... y31 y21 y11 y01
> 3rd row: ... ... y32 y22 y12 y02
> 4th row: ... ... y33 y23 y13 y03
> ... ...
> The length of a row is the same as the Height of the original image
> The number of rows is the same as the Width of the original image
>
> U:
> 1st row: ... ... u60 u40 u20 u00
> 2nd row: ... ... u61 u41 u21 u01
> ... ...
> The length of a row is a half of the Height of original image
> The size of rows the same as the Width of original image
>
> V:
> 1st row: ... ... v70 v50 v30 v10
> 2nd row: ... ... v71 v51 v31 v11
> ... ...
> The length of a row is a half of the Height of original image
> The size of rows the same as the Width of original image
>
> I'm using little endian, DM642 at 720MHz, and the SDRAM is 133MHz;
>
> Here is my code:
> int yyuuyyvv2planaryuvRoate90(char * pInBuf,char * pOutBuf, const int
> nWidth, const int nHeight, int nRightLeft) { int i, j;
> #if 0 //original code, without optimization
> char *restrict pSrc, *restrict pY, *restrict pU, *restrict pV;
> pSrc = pInBuf;
> //the last column of the first row after rotation
> pY = pOutBuf + nHeight - 1;
> pU = pOutBuf + nHeight * nWidth + (nHeight >> 1) - 1;
> pV = pOutBuf + nHeight * nWidth + (nHeight * nWidth >> 1) + (nHeight
> >> 1) - 1; for(i = 0; i < (nWidth << 1); i += 4) { pSrc = pInBuf +
> i; for(j = 0; j < nHeight; j += 2) { *pY = *pSrc; //y0 *(pY
> + nHeight) = *(pSrc + 1); //y1 *pU = *(pSrc + 2); *(pU +
> (nHeight >> 1)) = *(pSrc + 3); pSrc += (nWidth << 1); *(pY - 1)
> = *pSrc; *(pY + nHeight - 1) = *(pSrc + 1); *pV = *(pSrc + 2);
> *(pV + (nHeight >> 1)) = *(pSrc + 3); pY -= 2; pU--; pV--;
> pSrc += (nWidth << 1); } pY += (3 * nHeight); pU += 3 *
> (nHeight >> 1); pV += 3 * (nHeight >> 1); }
> #else //Optimized code
> //Src, Rotate 90 degree clockwise //counterclockwise
> /* little endian
> u1u0y1y0... ...
> v1v0y3y2... ...
> u3u2y5y4 ...
> v3v2y7y6 ...
> ... ...
> */
>
> //src pionters
> char *restrict pSrcRow0, *restrict pSrcRow1, *restrict pSrcRow2,
> *restrict pSrcRow3; //dst pionters char *restrict pY0, *restrict
> pY1; char *restrict pU0, *restrict pU1; char *restrict pV0,
> *restrict pV1; //temporary variables unsigned int nY0, nY1;
> unsigned int nPixel0, nPixel1, nPixel2, nPixel3; int nYuvWidth > (nWidth <<
1); int nYRowStep = 3 * nHeight; int nYColumnStep > nYuvWidth * 4; int
nUVStep = 3 * (nHeight >> 1); //init pointers
> pY0 = pOutBuf + nHeight - 4; //last column of the first row pY1 > pOutBuf +
2 * nHeight - 4;; //same column but the next row of pY0 pU0
> = pOutBuf + nHeight * nWidth + (nHeight >> 1) - 1; //last column of
> the first row pU1 = pU0 + (nHeight >> 1); //same column but the next
> row of pU0 pV0 = pOutBuf + nHeight * nWidth + (nHeight * nWidth >> 1)
> + (nHeight >> 1) - 1; //last column of the first row pV1 = pV0 +
> (nHeight >> 1); //same column but the next row of pV0 #pragma
> MUST_ITERATE(FF_WIDTH/2,FF_WIDTH/2,16); for(i = 0; i < nYuvWidth; i
> += 4) { pSrcRow0 = pInBuf + i; //first row pSrcRow1 = pSrcRow0 +
> nYuvWidth; //second row; pSrcRow2 = pSrcRow1 + nYuvWidth; //3rd row;
> pSrcRow3 = pSrcRow2 + nYuvWidth; //4th row #pragma
> MUST_ITERATE(FF_HEIGHT/2,FF_HEIGHT/2,8); for(j = 0; j < nHeight; j
> += 4) { nPixel0 = _mem4(pSrcRow0); //u1u0y1y0 nPixel1 >
_mem4(pSrcRow1); //v1v0y3y2 nPixel2 = _mem4(pSrcRow2); //u3u2y5y4
> nPixel3 = _mem4(pSrcRow3); //v3v2y7y6 nY0 = _pack2(nPixel0,
> nPixel1); //y1y0y3y2 nY1 = _pack2(nPixel2, nPixel3); //y5y4y7y6
> _mem4(pY0) = _packl4(nY0, nY1); //y0y2y4y6, row1 after rotate, little
endian
> _mem4(pY1) = _packh4(nY0, nY1); //y1y3y5y7, row2 after rotate,
> little endian
>
> *pU0-- = *(pSrcRow0 + 2);
> *pU0-- = *(pSrcRow2 + 2);
> *pU1-- = *(pSrcRow0 + 3);
> *pU1-- = *(pSrcRow2 + 3);
> *pV0-- = *(pSrcRow1 + 2);
> *pV0-- = *(pSrcRow3 + 2);
> *pV1-- = *(pSrcRow1 + 3);
> *pV1-- = *(pSrcRow3 + 3);
> pSrcRow0 += nYColumnStep; //jump 4 rows
> pSrcRow1 += nYColumnStep;
> pSrcRow2 += nYColumnStep;
> pSrcRow3 += nYColumnStep;
> pY0 -= 4; //go to previous column
> pY1 -= 4;
> }
> pY0 += nYRowStep; //go to the last column of the 3rd row below
> current row pY1 += nYRowStep; pU0 += nUVStep; pU1 += nUVStep;
> pV0 += nUVStep; pV1 += nUVStep; }
> #endif
> return 0;
> }
>
> I have little experience with C64x optimization; please tell me what
> I should do for further optimization.
>
> Thanks very much!
>
> Sincerely yours,
> Eric Sun
>
>
>
> _____________________________________
_____________________________________
Reply by shen...@gmail.com●December 1, 20102010-12-01
Hi, there!
I am trying to rotate a YUV image 90 degrees clockwise, but the efficiency is not
good enough. Please help me...
u00 and y00 correspond to the first pixel, u01 and y01 correspond to the next
pixel, and so on. Adjacent rows are stored contiguously in RAM.
What I want to do is rotate the image clockwise by 90 degrees and separate Y,
U, V for YUV422 format, so after rotation, the 1st column of the original image
should be placed in the 1st row, and the 1st row of the original image should be
placed in the last column. The data should look like this:
Y:
1st row: ... ... y30 y20 y10 y00
2nd row: ... ... y31 y21 y11 y01
3rd row: ... ... y32 y22 y12 y02
4th row: ... ... y33 y23 y13 y03
... ...
The length of a row is same as the Height of original image
The size of rows the same as the Width of original image
U:
1st row: ... ... u60 u40 u20 u00
2nd row: ... ... u61 u41 u21 u01
... ...
The length of a row is a half of the Height of original image
The size of rows the same as the Width of original image
V:
1st row: ... ... v70 v50 v30 v10
2nd row: ... ... v71 v51 v31 v11
... ...
The length of a row is a half of the Height of original image
The size of rows the same as the Width of original image
I'm using little endian, DM642 at 720MHz, and the SDRAM is 133MHz;