added more keys (@equipter)
[RRG-proxmark3.git] / tools / hitag2crack / crack5gpu / ht2crack5gpu.c
blob9074d8bd115c27de22b3830e8f6fbe8ef78d8055
1 /* ht2crack5.c
3 * This code is heavily based on the HiTag2 Hell CPU implementation
4 * from https://github.com/factoritbv/hitag2hell by FactorIT B.V.,
5 * with the following changes:
6 * * Main takes a UID and 2 {nR},{aR} pairs as arguments
7 * and searches for states producing the first aR sample,
8 * reconstructs the corresponding key candidates
9 * and tests them against the second nR,aR pair;
10 * * Reduce max_bitslices and some type sizes to fit OpenCL
11 * * Reuses the Hitag helping functions of the other attacks.
14 #include <stdint.h>
15 #include <stdbool.h>
16 #include <stdio.h>
17 #include <string.h>
18 #include <unistd.h>
19 #include <stdlib.h>
20 #include <inttypes.h>
21 #include <pthread.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <sys/stat.h>
25 #ifdef __APPLE__
26 #include <OpenCL/opencl.h>
27 #else
28 #define CL_TARGET_OPENCL_VERSION 220
29 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
30 #include <CL/cl.h>
31 #endif
32 #include "ht2crackutils.h"
34 const uint8_t bits[9] = {20, 14, 4, 3, 1, 1, 1, 1, 1};
35 #define lfsr_inv(state) (((state)<<1) | (__builtin_parityll((state) & ((0xce0044c101cd>>1)|(1ull<<(47))))))
36 #define i4(x,a,b,c,d) ((uint32_t)((((x)>>(a))&1)<<3)|(((x)>>(b))&1)<<2|(((x)>>(c))&1)<<1|(((x)>>(d))&1))
37 #define f(state) ((0xdd3929b >> ( (((0x3c65 >> i4(state, 2, 3, 5, 6) ) & 1) <<4) \
38 | ((( 0xee5 >> i4(state, 8,12,14,15) ) & 1) <<3) \
39 | ((( 0xee5 >> i4(state,17,21,23,26) ) & 1) <<2) \
40 | ((( 0xee5 >> i4(state,28,29,31,33) ) & 1) <<1) \
41 | (((0x3c65 >> i4(state,34,43,44,46) ) & 1) ))) & 1)
43 #define MAX_BITSLICES 32
44 #define VECTOR_SIZE (MAX_BITSLICES/8)
45 #define KERNELFILENAME "ht2crack5kernel.cl"
47 typedef unsigned int __attribute__((aligned(VECTOR_SIZE))) __attribute__((vector_size(VECTOR_SIZE))) bitslice_value_t;
48 typedef union {
49 bitslice_value_t value;
50 uint8_t bytes[MAX_BITSLICES / 8];
51 } bitslice_t;
53 // we never actually set or use the lowest 2 bits the initial state, so we can save 2 bitslices everywhere
54 __thread bitslice_t state[-2 + 32 + 48];
56 bitslice_t keystream[32];
57 bitslice_t bs_zeroes, bs_ones;
59 #define f_a_bs(a,b,c,d) (~(((a|b)&c)^(a|d)^b)) // 6 ops
60 #define f_b_bs(a,b,c,d) (~(((d|c)&(a^b))^(d|a|b))) // 7 ops
61 #define f_c_bs(a,b,c,d,e) (~((((((c^e)|d)&a)^b)&(c^b))^(((d^e)|a)&((d^b)|c)))) // 13 ops
62 #define lfsr_bs(i) (state[-2+i+ 0].value ^ state[-2+i+ 2].value ^ state[-2+i+ 3].value ^ state[-2+i+ 6].value ^ \
63 state[-2+i+ 7].value ^ state[-2+i+ 8].value ^ state[-2+i+16].value ^ state[-2+i+22].value ^ \
64 state[-2+i+23].value ^ state[-2+i+26].value ^ state[-2+i+30].value ^ state[-2+i+41].value ^ \
65 state[-2+i+42].value ^ state[-2+i+43].value ^ state[-2+i+46].value ^ state[-2+i+47].value);
66 #define get_bit(n, word) ((word >> (n)) & 1)
68 static uint64_t expand(uint64_t mask, uint64_t value) {
69 uint64_t fill = 0;
70 for (uint64_t bit_index = 0; bit_index < 48; bit_index++) {
71 if (mask & 1) {
72 fill |= (value & 1) << bit_index;
73 value >>= 1;
75 mask >>= 1;
77 return fill;
80 static void bitslice(const uint64_t value, bitslice_t *restrict bitsliced_value, const size_t bit_len, bool reverse) {
81 size_t bit_idx;
82 for (bit_idx = 0; bit_idx < bit_len; bit_idx++) {
83 bool bit;
84 if (reverse) {
85 bit = get_bit(bit_len - 1 - bit_idx, value);
86 } else {
87 bit = get_bit(bit_idx, value);
89 if (bit) {
90 bitsliced_value[bit_idx].value = bs_ones.value;
91 } else {
92 bitsliced_value[bit_idx].value = bs_zeroes.value;
97 uint32_t uid, nR1, aR1, nR2, aR2;
99 // Reduce type size of candidates array to fit OpenCL
100 uint16_t candidates[(1 << 20) * 3];
101 bitslice_t initial_bitslices[48];
102 size_t filter_pos[20] = {4, 7, 9, 13, 16, 18, 22, 24, 27, 30, 32, 35, 45, 47 };
103 size_t thread_count = 8;
104 size_t layer_0_found;
106 static void try_state(uint64_t s);
108 struct context {
109 char *kernelSource; // source for kernel
111 cl_platform_id platform_id; // compute platform id
112 cl_device_id device_id; // compute device id
113 cl_context context; // compute context
114 cl_command_queue commands; // compute command queue
115 cl_program program; // compute program
116 cl_kernel kernel; // compute kernel
118 // cl_mem cand_base; // device memory used for the candidate base
119 cl_mem keystream; // device memory used for the keystream array
120 cl_mem candidates; // device memory used for the candidates array
121 cl_mem matches; // device memory used for the matches array
122 cl_mem matches_found; // device memory used for the matches_found array
126 static void runKernel(struct context *ctx, uint32_t cand_base, uint64_t *matches, uint32_t *matches_found) {
127 int err;
128 size_t global[2];
130 // Write our data set into the input array in device memory
131 err = clEnqueueWriteBuffer(ctx->commands, ctx->matches_found, CL_TRUE, 0, sizeof(uint32_t), matches_found, 0, NULL, NULL);
132 if (err != CL_SUCCESS) {
133 printf("Error: Failed to enque kernel writebuffer in runKernel! %d\n", err);
134 exit(1);
137 // Set the arguments to our compute kernel
138 err = clSetKernelArg(ctx->kernel, 0, sizeof(uint32_t), &cand_base);
139 err |= clSetKernelArg(ctx->kernel, 4, sizeof(cl_mem), &ctx->matches_found);
140 if (err != CL_SUCCESS) {
141 printf("Error: Failed to set kernel arguments in runKernel! %d\n", err);
142 exit(1);
145 // Execute the kernel over the entire range of our 2d input data set using 8K * 1K threads
146 global[0] = 8192;
147 global[1] = 1024;
148 err = clEnqueueNDRangeKernel(ctx->commands, ctx->kernel, 2, NULL, global, NULL, 0, NULL, NULL);
149 if (err) {
150 printf("Error: Failed to execute kernel!\n");
151 exit(1);
154 // Wait for the command commands to get serviced before reading back results
155 err = clFinish(ctx->commands);
156 if (err) {
157 printf("Error: Failed to execute kernel! clFinish = %d\n", err);
158 exit(1);
161 // Read back the results from the device to verify the output
162 err = clEnqueueReadBuffer(ctx->commands, ctx->matches, CL_TRUE, 0, sizeof(uint64_t) * 8192, matches, 0, NULL, NULL);
163 if (err != CL_SUCCESS) {
164 printf("Error: Failed to read matches array! %d\n", err);
165 exit(1);
168 err = clEnqueueReadBuffer(ctx->commands, ctx->matches_found, CL_TRUE, 0, sizeof(uint32_t), matches_found, 0, NULL, NULL);
169 if (err != CL_SUCCESS) {
170 printf("Error: Failed to read matches_found! %d\n", err);
171 exit(1);
175 int main(int argc, char *argv[]) {
176 memset(candidates, 0, sizeof(candidates));
177 struct context ctx;
178 uint64_t matches[8192];
179 uint32_t matches_found[1];
181 // set constants
182 memset(bs_ones.bytes, 0xff, VECTOR_SIZE);
183 memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE);
185 uint32_t target = 0;
187 if (argc < 6) {
188 printf("%s UID {nR1} {aR1} {nR2} {aR2}\n", argv[0]);
189 exit(1);
192 if (!strncmp(argv[1], "0x", 2) || !strncmp(argv[1], "0X", 2)) {
193 uid = rev32(hexreversetoulong(argv[1] + 2));
194 } else {
195 uid = rev32(hexreversetoulong(argv[1]));
198 if (!strncmp(argv[2], "0x", 2) || !strncmp(argv[2], "0X", 2)) {
199 nR1 = rev32(hexreversetoulong(argv[2] + 2));
200 } else {
201 nR1 = rev32(hexreversetoulong(argv[2]));
204 aR1 = strtol(argv[3], NULL, 16);
206 if (!strncmp(argv[4], "0x", 2) || !strncmp(argv[4], "0X", 2)) {
207 nR2 = rev32(hexreversetoulong(argv[4] + 2));
208 } else {
209 nR2 = rev32(hexreversetoulong(argv[4]));
212 aR2 = strtol(argv[5], NULL, 16);
214 target = ~aR1;
215 // bitslice inverse target bits
216 bitslice(~target, keystream, 32, true);
218 // bitslice all possible 256 values in the lowest 8 bits
219 memset(initial_bitslices[0].bytes, 0xaa, VECTOR_SIZE);
220 memset(initial_bitslices[1].bytes, 0xcc, VECTOR_SIZE);
221 memset(initial_bitslices[2].bytes, 0xf0, VECTOR_SIZE);
222 size_t interval = 1;
223 for (size_t bit = 3; bit < 8; bit++) {
224 for (size_t byte = 0; byte < VECTOR_SIZE;) {
225 for (size_t length = 0; length < interval; length++) {
226 initial_bitslices[bit].bytes[byte++] = 0x00;
228 for (size_t length = 0; length < interval; length++) {
229 initial_bitslices[bit].bytes[byte++] = 0xff;
232 interval <<= 1;
235 // compute layer 0 output
236 for (size_t i0 = 0; i0 < 1 << 20; i0++) {
237 uint64_t state0 = expand(0x5806b4a2d16c, i0);
239 if (f(state0) == target >> 31) {
240 // cf kernel, state is now split in 3 shorts >> 2
241 candidates[(layer_0_found * 3) + 0] = (uint16_t)((state0 >> (32 + 2)) & 0xffff);
242 candidates[(layer_0_found * 3) + 1] = (uint16_t)((state0 >> (16 + 2)) & 0xffff);
243 candidates[(layer_0_found * 3) + 2] = (uint16_t)((state0 >> (0 + 2)) & 0xffff);
244 layer_0_found++;
248 // load OpenCL kernel source
249 ////////////////////////////
250 struct stat filestat;
251 int fd;
253 fd = open(KERNELFILENAME, O_RDONLY);
254 if (fd <= 0) {
255 printf("Cannot open %s\n", KERNELFILENAME);
256 exit(1);
259 if (fstat(fd, &filestat)) {
260 printf("Cannot stat %s\n", KERNELFILENAME);
261 exit(1);
264 ctx.kernelSource = (char *)malloc(filestat.st_size);
265 if (!ctx.kernelSource) {
266 printf("Cannot malloc kernelSource\n");
267 exit(1);
270 if (read(fd, ctx.kernelSource, filestat.st_size) < filestat.st_size) {
271 printf("Cannot read %s\n", KERNELFILENAME);
272 exit(1);
275 close(fd);
277 // discover and set up compute device
278 /////////////////////////////////////
279 int err;
281 // Connect to a compute device
282 err = clGetPlatformIDs(1, &(ctx.platform_id), NULL);
283 if (err != CL_SUCCESS) {
284 printf("Error: Failed to get platform id: %d\n", err);
285 exit(1);
288 err = clGetDeviceIDs(ctx.platform_id, CL_DEVICE_TYPE_GPU, 1, &(ctx.device_id), NULL);
289 if (err != CL_SUCCESS) {
290 printf("Error: Failed to create a device group!: %d\n", err);
291 exit(1);
294 // Create a compute context
295 ctx.context = clCreateContext(0, 1, &(ctx.device_id), NULL, NULL, &err);
296 if (!ctx.context) {
297 printf("Error: Failed to create a compute context!\n");
298 exit(1);
301 // Create a command commands
302 ctx.commands = clCreateCommandQueue(ctx.context, ctx.device_id, 0, &err);
303 if (!ctx.commands) {
304 printf("Error: Failed to create a command commands!\n");
305 exit(1);
308 // Create the compute program from the source buffer
309 ctx.program = clCreateProgramWithSource(ctx.context, 1, (const char **) & (ctx.kernelSource), NULL, &err);
310 if (!ctx.program) {
311 printf("Error: Failed to create compute program!\n");
312 exit(1);
315 // Build the program executable
316 err = clBuildProgram(ctx.program, 0, NULL, "-Werror", NULL, NULL);
318 if (err != CL_SUCCESS) {
319 size_t len;
320 char buffer[1024 * 1024];
322 printf("Error: Failed to build program executable!\n");
323 err = clGetProgramBuildInfo(ctx.program, ctx.device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
324 if (err != CL_SUCCESS) {
325 printf("clGetProgramBuildInfo failed: %d\n", err);
326 exit(1);
327 } else {
328 printf("%s\n", buffer);
329 exit(1);
333 // Create the compute kernel in the program we wish to run
334 ctx.kernel = clCreateKernel(ctx.program, "find_state", &err);
335 if (!ctx.kernel || err != CL_SUCCESS) {
336 printf("Error: Failed to create compute kernel!\n");
337 exit(1);
340 ctx.candidates = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, sizeof(uint16_t) * ((1 << 20) * 3), NULL, NULL);
341 ctx.keystream = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, VECTOR_SIZE * 32, NULL, NULL);
343 ctx.matches = clCreateBuffer(ctx.context, CL_MEM_WRITE_ONLY, sizeof(uint64_t) * 8192, NULL, NULL);
344 ctx.matches_found = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, sizeof(uint32_t), NULL, NULL);
346 if (!ctx.candidates || !ctx.keystream || !ctx.matches || !ctx.matches_found) {
347 printf("Error: Failed to allocate device memory!\n");
348 exit(1);
351 // set up constant vars
352 ///////////////////////
354 // Write our data set into the input array in device memory
355 err = clEnqueueWriteBuffer(ctx.commands, ctx.keystream, CL_TRUE, 0, VECTOR_SIZE * 32, keystream, 0, NULL, NULL);
356 if (err != CL_SUCCESS) {
357 printf("Error: Failed to write to keystream array!\n");
358 exit(1);
361 err = clEnqueueWriteBuffer(ctx.commands, ctx.candidates, CL_TRUE, 0, sizeof(uint16_t) * ((1 << 20) * 3), candidates, 0, NULL, NULL);
362 if (err != CL_SUCCESS) {
363 printf("Error: Failed to write to candidates array!\n");
364 exit(1);
367 // Set the arguments to our compute kernel
368 err = clSetKernelArg(ctx.kernel, 1, sizeof(cl_mem), &ctx.candidates);
369 err |= clSetKernelArg(ctx.kernel, 2, sizeof(cl_mem), &ctx.keystream);
370 err |= clSetKernelArg(ctx.kernel, 3, sizeof(cl_mem), &ctx.matches);
371 if (err != CL_SUCCESS) {
372 printf("Error: Failed to set kernel arguments! %d\n", err);
373 exit(1);
376 // run kernel
377 /////////////
378 for (uint32_t step = 0; step < 64; step++) {
379 printf("slice %3u/64: ", step + 1);
380 fflush(stdout);
381 matches_found[0] = 0;
382 runKernel(&ctx, step << 13, matches, matches_found);
384 printf("%5u candidates\n", matches_found[0]);
385 for (uint32_t match = 0; match < matches_found[0]; match++) {
386 try_state(matches[match]);
390 printf("Key not found\n");
391 exit(1);
394 static void try_state(uint64_t s) {
395 Hitag_State hstate;
396 uint64_t keyrev, nR1xk;
397 uint32_t b = 0;
399 hstate.shiftreg = s;
400 rollback(&hstate, 2);
402 // recover key
403 keyrev = hstate.shiftreg & 0xffff;
404 nR1xk = (hstate.shiftreg >> 16) & 0xffffffff;
405 for (int i = 0; i < 32; i++) {
406 hstate.shiftreg = ((hstate.shiftreg) << 1) | ((uid >> (31 - i)) & 0x1);
407 b = (b << 1) | fnf(hstate.shiftreg);
409 keyrev |= (nR1xk ^ nR1 ^ b) << 16;
411 // test key
412 hitag2_init(&hstate, keyrev, uid, nR2);
413 if ((aR2 ^ hitag2_nstep(&hstate, 32)) == 0xffffffff) {
415 uint64_t key = rev64(keyrev);
417 printf("Key: ");
418 for (int i = 0; i < 6; i++) {
419 printf("%02X", (uint8_t)(key & 0xff));
420 key = key >> 8;
422 printf("\n");
423 exit(0);