3 * This code is heavily based on the HiTag2 Hell CPU implementation
4 * from https://github.com/factoritbv/hitag2hell by FactorIT B.V.,
5 * with the following changes:
6 * * Main takes a UID and 2 {nR},{aR} pairs as arguments
7 * and searches for states producing the first aR sample,
8 * reconstructs the corresponding key candidates
9 * and tests them against the second nR,aR pair;
10 * * Reduce max_bitslices and some type sizes to fit OpenCL
11 * * Reuses the Hitag helping functions of the other attacks.
23 #include <sys/types.h>
26 #include <OpenCL/opencl.h>
28 #define CL_TARGET_OPENCL_VERSION 220
29 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
32 #include "ht2crackutils.h"
34 const uint8_t bits
[9] = {20, 14, 4, 3, 1, 1, 1, 1, 1};
35 #define lfsr_inv(state) (((state)<<1) | (__builtin_parityll((state) & ((0xce0044c101cd>>1)|(1ull<<(47))))))
36 #define i4(x,a,b,c,d) ((uint32_t)((((x)>>(a))&1)<<3)|(((x)>>(b))&1)<<2|(((x)>>(c))&1)<<1|(((x)>>(d))&1))
37 #define f(state) ((0xdd3929b >> ( (((0x3c65 >> i4(state, 2, 3, 5, 6) ) & 1) <<4) \
38 | ((( 0xee5 >> i4(state, 8,12,14,15) ) & 1) <<3) \
39 | ((( 0xee5 >> i4(state,17,21,23,26) ) & 1) <<2) \
40 | ((( 0xee5 >> i4(state,28,29,31,33) ) & 1) <<1) \
41 | (((0x3c65 >> i4(state,34,43,44,46) ) & 1) ))) & 1)
43 #define MAX_BITSLICES 32
44 #define VECTOR_SIZE (MAX_BITSLICES/8)
45 #define KERNELFILENAME "ht2crack5kernel.cl"
47 typedef unsigned int __attribute__((aligned(VECTOR_SIZE
))) __attribute__((vector_size(VECTOR_SIZE
))) bitslice_value_t
;
49 bitslice_value_t value
;
50 uint8_t bytes
[MAX_BITSLICES
/ 8];
53 // we never actually set or use the lowest 2 bits the initial state, so we can save 2 bitslices everywhere
54 __thread bitslice_t state
[-2 + 32 + 48];
56 bitslice_t keystream
[32];
57 bitslice_t bs_zeroes
, bs_ones
;
59 #define f_a_bs(a,b,c,d) (~(((a|b)&c)^(a|d)^b)) // 6 ops
60 #define f_b_bs(a,b,c,d) (~(((d|c)&(a^b))^(d|a|b))) // 7 ops
61 #define f_c_bs(a,b,c,d,e) (~((((((c^e)|d)&a)^b)&(c^b))^(((d^e)|a)&((d^b)|c)))) // 13 ops
62 #define lfsr_bs(i) (state[-2+i+ 0].value ^ state[-2+i+ 2].value ^ state[-2+i+ 3].value ^ state[-2+i+ 6].value ^ \
63 state[-2+i+ 7].value ^ state[-2+i+ 8].value ^ state[-2+i+16].value ^ state[-2+i+22].value ^ \
64 state[-2+i+23].value ^ state[-2+i+26].value ^ state[-2+i+30].value ^ state[-2+i+41].value ^ \
65 state[-2+i+42].value ^ state[-2+i+43].value ^ state[-2+i+46].value ^ state[-2+i+47].value);
66 #define get_bit(n, word) ((word >> (n)) & 1)
68 static uint64_t expand(uint64_t mask
, uint64_t value
) {
70 for (uint64_t bit_index
= 0; bit_index
< 48; bit_index
++) {
72 fill
|= (value
& 1) << bit_index
;
80 static void bitslice(const uint64_t value
, bitslice_t
*restrict bitsliced_value
, const size_t bit_len
, bool reverse
) {
82 for (bit_idx
= 0; bit_idx
< bit_len
; bit_idx
++) {
85 bit
= get_bit(bit_len
- 1 - bit_idx
, value
);
87 bit
= get_bit(bit_idx
, value
);
90 bitsliced_value
[bit_idx
].value
= bs_ones
.value
;
92 bitsliced_value
[bit_idx
].value
= bs_zeroes
.value
;
97 uint32_t uid
, nR1
, aR1
, nR2
, aR2
;
99 // Reduce type size of candidates array to fit OpenCL
100 uint16_t candidates
[(1 << 20) * 3];
101 bitslice_t initial_bitslices
[48];
102 size_t filter_pos
[20] = {4, 7, 9, 13, 16, 18, 22, 24, 27, 30, 32, 35, 45, 47 };
103 size_t thread_count
= 8;
104 size_t layer_0_found
;
106 static void try_state(uint64_t s
);
109 char *kernelSource
; // source for kernel
111 cl_platform_id platform_id
; // compute platform id
112 cl_device_id device_id
; // compute device id
113 cl_context context
; // compute context
114 cl_command_queue commands
; // compute command queue
115 cl_program program
; // compute program
116 cl_kernel kernel
; // compute kernel
118 // cl_mem cand_base; // device memory used for the candidate base
119 cl_mem keystream
; // device memory used for the keystream array
120 cl_mem candidates
; // device memory used for the candidates array
121 cl_mem matches
; // device memory used for the matches array
122 cl_mem matches_found
; // device memory used for the matches_found array
126 static void runKernel(struct context
*ctx
, uint32_t cand_base
, uint64_t *matches
, uint32_t *matches_found
) {
130 // Write our data set into the input array in device memory
131 err
= clEnqueueWriteBuffer(ctx
->commands
, ctx
->matches_found
, CL_TRUE
, 0, sizeof(uint32_t), matches_found
, 0, NULL
, NULL
);
132 if (err
!= CL_SUCCESS
) {
133 printf("Error: Failed to enque kernel writebuffer in runKernel! %d\n", err
);
137 // Set the arguments to our compute kernel
138 err
= clSetKernelArg(ctx
->kernel
, 0, sizeof(uint32_t), &cand_base
);
139 err
|= clSetKernelArg(ctx
->kernel
, 4, sizeof(cl_mem
), &ctx
->matches_found
);
140 if (err
!= CL_SUCCESS
) {
141 printf("Error: Failed to set kernel arguments in runKernel! %d\n", err
);
145 // Execute the kernel over the entire range of our 2d input data set using 8K * 1K threads
148 err
= clEnqueueNDRangeKernel(ctx
->commands
, ctx
->kernel
, 2, NULL
, global
, NULL
, 0, NULL
, NULL
);
150 printf("Error: Failed to execute kernel!\n");
154 // Wait for the command commands to get serviced before reading back results
155 err
= clFinish(ctx
->commands
);
157 printf("Error: Failed to execute kernel! clFinish = %d\n", err
);
161 // Read back the results from the device to verify the output
162 err
= clEnqueueReadBuffer(ctx
->commands
, ctx
->matches
, CL_TRUE
, 0, sizeof(uint64_t) * 8192, matches
, 0, NULL
, NULL
);
163 if (err
!= CL_SUCCESS
) {
164 printf("Error: Failed to read matches array! %d\n", err
);
168 err
= clEnqueueReadBuffer(ctx
->commands
, ctx
->matches_found
, CL_TRUE
, 0, sizeof(uint32_t), matches_found
, 0, NULL
, NULL
);
169 if (err
!= CL_SUCCESS
) {
170 printf("Error: Failed to read matches_found! %d\n", err
);
175 int main(int argc
, char *argv
[]) {
176 memset(candidates
, 0, sizeof(candidates
));
178 uint64_t matches
[8192];
179 uint32_t matches_found
[1];
182 memset(bs_ones
.bytes
, 0xff, VECTOR_SIZE
);
183 memset(bs_zeroes
.bytes
, 0x00, VECTOR_SIZE
);
188 printf("%s UID {nR1} {aR1} {nR2} {aR2}\n", argv
[0]);
192 if (!strncmp(argv
[1], "0x", 2) || !strncmp(argv
[1], "0X", 2)) {
193 uid
= rev32(hexreversetoulong(argv
[1] + 2));
195 uid
= rev32(hexreversetoulong(argv
[1]));
198 if (!strncmp(argv
[2], "0x", 2) || !strncmp(argv
[2], "0X", 2)) {
199 nR1
= rev32(hexreversetoulong(argv
[2] + 2));
201 nR1
= rev32(hexreversetoulong(argv
[2]));
204 aR1
= strtol(argv
[3], NULL
, 16);
206 if (!strncmp(argv
[4], "0x", 2) || !strncmp(argv
[4], "0X", 2)) {
207 nR2
= rev32(hexreversetoulong(argv
[4] + 2));
209 nR2
= rev32(hexreversetoulong(argv
[4]));
212 aR2
= strtol(argv
[5], NULL
, 16);
215 // bitslice inverse target bits
216 bitslice(~target
, keystream
, 32, true);
218 // bitslice all possible 256 values in the lowest 8 bits
219 memset(initial_bitslices
[0].bytes
, 0xaa, VECTOR_SIZE
);
220 memset(initial_bitslices
[1].bytes
, 0xcc, VECTOR_SIZE
);
221 memset(initial_bitslices
[2].bytes
, 0xf0, VECTOR_SIZE
);
223 for (size_t bit
= 3; bit
< 8; bit
++) {
224 for (size_t byte
= 0; byte
< VECTOR_SIZE
;) {
225 for (size_t length
= 0; length
< interval
; length
++) {
226 initial_bitslices
[bit
].bytes
[byte
++] = 0x00;
228 for (size_t length
= 0; length
< interval
; length
++) {
229 initial_bitslices
[bit
].bytes
[byte
++] = 0xff;
235 // compute layer 0 output
236 for (size_t i0
= 0; i0
< 1 << 20; i0
++) {
237 uint64_t state0
= expand(0x5806b4a2d16c, i0
);
239 if (f(state0
) == target
>> 31) {
240 // cf kernel, state is now split in 3 shorts >> 2
241 candidates
[(layer_0_found
* 3) + 0] = (uint16_t)((state0
>> (32 + 2)) & 0xffff);
242 candidates
[(layer_0_found
* 3) + 1] = (uint16_t)((state0
>> (16 + 2)) & 0xffff);
243 candidates
[(layer_0_found
* 3) + 2] = (uint16_t)((state0
>> (0 + 2)) & 0xffff);
248 // load OpenCL kernel source
249 ////////////////////////////
250 struct stat filestat
;
253 fd
= open(KERNELFILENAME
, O_RDONLY
);
255 printf("Cannot open %s\n", KERNELFILENAME
);
259 if (fstat(fd
, &filestat
)) {
260 printf("Cannot stat %s\n", KERNELFILENAME
);
264 ctx
.kernelSource
= (char *)malloc(filestat
.st_size
);
265 if (!ctx
.kernelSource
) {
266 printf("Cannot malloc kernelSource\n");
270 if (read(fd
, ctx
.kernelSource
, filestat
.st_size
) < filestat
.st_size
) {
271 printf("Cannot read %s\n", KERNELFILENAME
);
277 // discover and set up compute device
278 /////////////////////////////////////
281 // Connect to a compute device
282 err
= clGetPlatformIDs(1, &(ctx
.platform_id
), NULL
);
283 if (err
!= CL_SUCCESS
) {
284 printf("Error: Failed to get platform id: %d\n", err
);
288 err
= clGetDeviceIDs(ctx
.platform_id
, CL_DEVICE_TYPE_GPU
, 1, &(ctx
.device_id
), NULL
);
289 if (err
!= CL_SUCCESS
) {
290 printf("Error: Failed to create a device group!: %d\n", err
);
294 // Create a compute context
295 ctx
.context
= clCreateContext(0, 1, &(ctx
.device_id
), NULL
, NULL
, &err
);
297 printf("Error: Failed to create a compute context!\n");
301 // Create a command commands
302 ctx
.commands
= clCreateCommandQueue(ctx
.context
, ctx
.device_id
, 0, &err
);
304 printf("Error: Failed to create a command commands!\n");
308 // Create the compute program from the source buffer
309 ctx
.program
= clCreateProgramWithSource(ctx
.context
, 1, (const char **) & (ctx
.kernelSource
), NULL
, &err
);
311 printf("Error: Failed to create compute program!\n");
315 // Build the program executable
316 err
= clBuildProgram(ctx
.program
, 0, NULL
, "-Werror", NULL
, NULL
);
318 if (err
!= CL_SUCCESS
) {
320 char buffer
[1024 * 1024];
322 printf("Error: Failed to build program executable!\n");
323 err
= clGetProgramBuildInfo(ctx
.program
, ctx
.device_id
, CL_PROGRAM_BUILD_LOG
, sizeof(buffer
), buffer
, &len
);
324 if (err
!= CL_SUCCESS
) {
325 printf("clGetProgramBuildInfo failed: %d\n", err
);
328 printf("%s\n", buffer
);
333 // Create the compute kernel in the program we wish to run
334 ctx
.kernel
= clCreateKernel(ctx
.program
, "find_state", &err
);
335 if (!ctx
.kernel
|| err
!= CL_SUCCESS
) {
336 printf("Error: Failed to create compute kernel!\n");
340 ctx
.candidates
= clCreateBuffer(ctx
.context
, CL_MEM_READ_ONLY
, sizeof(uint16_t) * ((1 << 20) * 3), NULL
, NULL
);
341 ctx
.keystream
= clCreateBuffer(ctx
.context
, CL_MEM_READ_ONLY
, VECTOR_SIZE
* 32, NULL
, NULL
);
343 ctx
.matches
= clCreateBuffer(ctx
.context
, CL_MEM_WRITE_ONLY
, sizeof(uint64_t) * 8192, NULL
, NULL
);
344 ctx
.matches_found
= clCreateBuffer(ctx
.context
, CL_MEM_READ_WRITE
, sizeof(uint32_t), NULL
, NULL
);
346 if (!ctx
.candidates
|| !ctx
.keystream
|| !ctx
.matches
|| !ctx
.matches_found
) {
347 printf("Error: Failed to allocate device memory!\n");
351 // set up constant vars
352 ///////////////////////
354 // Write our data set into the input array in device memory
355 err
= clEnqueueWriteBuffer(ctx
.commands
, ctx
.keystream
, CL_TRUE
, 0, VECTOR_SIZE
* 32, keystream
, 0, NULL
, NULL
);
356 if (err
!= CL_SUCCESS
) {
357 printf("Error: Failed to write to keystream array!\n");
361 err
= clEnqueueWriteBuffer(ctx
.commands
, ctx
.candidates
, CL_TRUE
, 0, sizeof(uint16_t) * ((1 << 20) * 3), candidates
, 0, NULL
, NULL
);
362 if (err
!= CL_SUCCESS
) {
363 printf("Error: Failed to write to candidates array!\n");
367 // Set the arguments to our compute kernel
368 err
= clSetKernelArg(ctx
.kernel
, 1, sizeof(cl_mem
), &ctx
.candidates
);
369 err
|= clSetKernelArg(ctx
.kernel
, 2, sizeof(cl_mem
), &ctx
.keystream
);
370 err
|= clSetKernelArg(ctx
.kernel
, 3, sizeof(cl_mem
), &ctx
.matches
);
371 if (err
!= CL_SUCCESS
) {
372 printf("Error: Failed to set kernel arguments! %d\n", err
);
378 for (uint32_t step
= 0; step
< 64; step
++) {
379 printf("slice %3u/64: ", step
+ 1);
381 matches_found
[0] = 0;
382 runKernel(&ctx
, step
<< 13, matches
, matches_found
);
384 printf("%5u candidates\n", matches_found
[0]);
385 for (uint32_t match
= 0; match
< matches_found
[0]; match
++) {
386 try_state(matches
[match
]);
390 printf("Key not found\n");
394 static void try_state(uint64_t s
) {
396 uint64_t keyrev
, nR1xk
;
400 rollback(&hstate
, 2);
403 keyrev
= hstate
.shiftreg
& 0xffff;
404 nR1xk
= (hstate
.shiftreg
>> 16) & 0xffffffff;
405 for (int i
= 0; i
< 32; i
++) {
406 hstate
.shiftreg
= ((hstate
.shiftreg
) << 1) | ((uid
>> (31 - i
)) & 0x1);
407 b
= (b
<< 1) | fnf(hstate
.shiftreg
);
409 keyrev
|= (nR1xk
^ nR1
^ b
) << 16;
412 hitag2_init(&hstate
, keyrev
, uid
, nR2
);
413 if ((aR2
^ hitag2_nstep(&hstate
, 32)) == 0xffffffff) {
415 uint64_t key
= rev64(keyrev
);
418 for (int i
= 0; i
< 6; i
++) {
419 printf("%02X", (uint8_t)(key
& 0xff));