vp8/encoder/arm/neon/shortfdct_neon.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
   12     EXPORT  |vp8_short_fdct4x4_neon|            ; both entry points are made visible to the linker
   13     EXPORT  |vp8_short_fdct8x4_neon|
   14     ARM                                         ; assemble as ARM (not Thumb) instructions
   15     REQUIRE8                                    ; this code requires an 8-byte aligned stack
   16     PRESERVE8                                   ; and keeps the stack 8-byte aligned
   17
   18
   19     AREA ||.text||, CODE, READONLY, ALIGN=2     ; read-only code section, aligned to 2^2 = 4 bytes
  20
  21 ; r0    short *input
  22 ; r1    short *output
  23 ; r2    int pitch
  24 ; Input has a pitch, output is contiguous
  25 |vp8_short_fdct4x4_neon| PROC
  26     ldr             r12, _dct_matrix_
  27     vld1.16         d0, [r0], r2
  28     vld1.16         d1, [r0], r2
  29     vld1.16         d2, [r0], r2
  30     vld1.16         d3, [r0]
  31     vld1.16         {q2, q3}, [r12]
  32
  33 ;first stage
  34     vmull.s16       q11, d4, d0[0]              ;i=0
  35     vmull.s16       q12, d4, d1[0]              ;i=1
  36     vmull.s16       q13, d4, d2[0]              ;i=2
  37     vmull.s16       q14, d4, d3[0]              ;i=3
  38
  39     vmlal.s16       q11, d5, d0[1]
  40     vmlal.s16       q12, d5, d1[1]
  41     vmlal.s16       q13, d5, d2[1]
  42     vmlal.s16       q14, d5, d3[1]
  43
  44     vmlal.s16       q11, d6, d0[2]
  45     vmlal.s16       q12, d6, d1[2]
  46     vmlal.s16       q13, d6, d2[2]
  47     vmlal.s16       q14, d6, d3[2]
  48
  49     vmlal.s16       q11, d7, d0[3]              ;sumtemp for i=0
  50     vmlal.s16       q12, d7, d1[3]              ;sumtemp for i=1
  51     vmlal.s16       q13, d7, d2[3]              ;sumtemp for i=2
  52     vmlal.s16       q14, d7, d3[3]              ;sumtemp for i=3
  53
  54     ; rounding
  55     vrshrn.i32      d22, q11, #14
  56     vrshrn.i32      d24, q12, #14
  57     vrshrn.i32      d26, q13, #14
  58     vrshrn.i32      d28, q14, #14
  59
  60 ;second stage
  61     vmull.s16       q4, d22, d4[0]              ;i=0
  62     vmull.s16       q5, d22, d4[1]              ;i=1
  63     vmull.s16       q6, d22, d4[2]              ;i=2
  64     vmull.s16       q7, d22, d4[3]              ;i=3
  65
  66     vmlal.s16       q4, d24, d5[0]
  67     vmlal.s16       q5, d24, d5[1]
  68     vmlal.s16       q6, d24, d5[2]
  69     vmlal.s16       q7, d24, d5[3]
  70
  71     vmlal.s16       q4, d26, d6[0]
  72     vmlal.s16       q5, d26, d6[1]
  73     vmlal.s16       q6, d26, d6[2]
  74     vmlal.s16       q7, d26, d6[3]
  75
  76     vmlal.s16       q4, d28, d7[0]              ;sumtemp for i=0
  77     vmlal.s16       q5, d28, d7[1]              ;sumtemp for i=1
  78     vmlal.s16       q6, d28, d7[2]              ;sumtemp for i=2
  79     vmlal.s16       q7, d28, d7[3]              ;sumtemp for i=3
  80
  81     vrshr.s32       q0, q4, #16
  82     vrshr.s32       q1, q5, #16
  83     vrshr.s32       q2, q6, #16
  84     vrshr.s32       q3, q7, #16
  85
  86     vmovn.i32       d0, q0
  87     vmovn.i32       d1, q1
  88     vmovn.i32       d2, q2
  89     vmovn.i32       d3, q3
  90
  91     vst1.16         {q0, q1}, [r1]
  92
  93     bx              lr
  94
  95     ENDP
  96
  97 ; r0    short *input
  98 ; r1    short *output
  99 ; r2    int pitch
 100 |vp8_short_fdct8x4_neon| PROC
 101     ; Store link register and input before calling
 102     ;  first 4x4 fdct.  Do not need to worry about
 103     ;  output or pitch because those pointers are not
 104     ;  touched in the 4x4 fdct function
 105     stmdb           sp!, {r0, lr}
 106
 107     bl              vp8_short_fdct4x4_neon
 108
 109     ldmia           sp!, {r0, lr}
 110
 111     ; Move to the next block of data.
 112     add             r0, r0, #8
 113     add             r1, r1, #32
 114
 115     ; Second time through do not store off the
 116     ;  link register, just return from the 4x4 fdtc
 117     b               vp8_short_fdct4x4_neon
 118
 119     ; Should never get to this.
 120     bx              lr
 121
 122     ENDP
 123
 124 ;-----------------
 125
  126 _dct_matrix_
  127     DCD     dct_matrix                  ; address of the table below; the 4x4 routine loads it with ldr
  128 dct_matrix
  129 ;   DCW     23170,  30274,  23170, 12540
  130 ;   DCW     23170,  12540, -23170,-30274
  131 ;   DCW     23170, -12540, -23170, 30274
  132 ;   DCW     23170, -30274,  23170,-12540
  133 ; 23170 =  0x5a82
  134 ; -23170 =  0xa57e
  135 ; 30274 =  0x7642
  136 ; -30274 =  0x89be
  137 ; 12540 =  0x30fc
  138 ; -12540 = 0xcf04
  139     DCD     0x76425a82, 0x30fc5a82      ; row 0: two 16-bit entries per word, low half first (little-endian)
  140     DCD     0x30fc5a82, 0x89bea57e      ; row 1
  141     DCD     0xcf045a82, 0x7642a57e      ; row 2
  142     DCD     0x89be5a82, 0xcf045a82      ; row 3
  143
  144     END