2 * This code is heavily based on the crack5gpu.
4 * Additional changes done by Gabriele 'matrix' Gristina <gabriele.gristina@gmail.com>
5 * - Using optimized OpenCL kernel (ht2crack5opencl_kernel.cl)
6 * - Rewriting OpenCL host code
7 * - Add OpenCL Platforms/Devices enumeration, used to selectively enable kernel optimizations
8 * - Support Multi-Platform (GPU & CPU), using custom async or sequential thread engine, and queue
9 * - Reduce memory read from OpenCL device to host (for each iteration only the exact number of candidates are read, instead of a big buffer)
10 * - Support 'Computing Profiles', to fine-tune workloads based on available resources
11 * - Support HiTag2 Key check on device.
12 * In this case the memory in use is reduced a lot, but it loses ~1 sec of performance
13 * (with GeForce GTX 1080 Ti, 70.449128 vs 71.062680 (Slice 4043/4096))
25 #include <sys/types.h>
31 #include "ht2crack5opencl.h"
36 #include "dolphin_macro.h"
39 #define AEND "\x1b[0m"
40 #define _RED_(s) "\x1b[31m" s AEND
41 #define _GREEN_(s) "\x1b[32m" s AEND
42 #define _YELLOW_(s) "\x1b[33m" s AEND
43 #define _CYAN_(s) "\x1b[36m" s AEND
45 #if defined(__MINGW64__)
46 #define timersub(a, b, result) \
48 (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
49 (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
50 if ((result)->tv_usec < 0) { \
52 (result)->tv_usec += 1000000;\
57 #define MAX_BITSLICES 32
58 #define VECTOR_SIZE (MAX_BITSLICES/8)
60 typedef unsigned int __attribute__((aligned(VECTOR_SIZE
))) __attribute__((vector_size(VECTOR_SIZE
))) bitslice_value_t
;
62 bitslice_value_t value
;
63 uint8_t bytes
[VECTOR_SIZE
];
66 static bitslice_t keystream
[32];
68 //uint64_t candidates[(1 << 20)];
69 // Reduce type size of candidates array to fit OpenCL
70 static uint16_t candidates
[(1 << 20) * 3];
72 // compute profile constants, from low to high workloads
73 static unsigned int profiles
[11][2] = {
74 { 16384, 5 }, // 0, best for Intel GPU's with Neo
75 { 8192, 6 }, // 1, only for Intel NEO
76 { 4096, 7 }, // 2 (old 0) seems the best for all others (also NVIDIA) :D Apple/Intel GPU's stable here
77 { 2048, 8 }, // 3 (old 1) usefulfor any kind of CPU's
81 { 128, 12 }, // 7, (old 5) the last good value with NVIDIA GPU's
87 static uint64_t expand(uint64_t mask
, uint64_t value
) {
90 for (uint64_t bit_index
= 0; bit_index
< 48; bit_index
++) {
92 fill
|= (value
& 1) << bit_index
;
101 static void bitslice(const uint64_t value
, bitslice_t
*restrict bitsliced_value
) {
103 bitslice_t bs_zeroes
, bs_ones
;
104 memset((uint8_t *)&bs_ones
, 0xff, VECTOR_SIZE
);
105 memset((uint8_t *)&bs_zeroes
, 0x00, VECTOR_SIZE
);
107 for (size_t bit_idx
= 0; bit_idx
< 32; bit_idx
++) {
108 const bool bit
= get_bit(32 - 1 - bit_idx
, value
);
109 bitsliced_value
[bit_idx
].value
= (bit
) ? bs_ones
.value
: bs_zeroes
.value
;
113 // convert byte-reversed 8 digit hex to unsigned long
114 static unsigned long hexreversetoulong(char *hex
) {
115 unsigned long ret
= 0L;
119 if (strlen(hex
) != 8)
122 for (i
= 0 ; i
< 4 ; ++i
) {
123 if (sscanf(hex
, "%2X", &x
) != 1)
125 ret
+= ((unsigned long) x
) << i
* 8;
132 static const char *emoj
[3][2] = { {"∩", "つ"}, {"つ", "⊃"}, {"⊃", "੭ " } };
135 __attribute__((noreturn
))
136 static void usage(char *name
) {
137 printf("%s [options] {UID} {nR1} {aR1} {nR2} {aR2}\n\n" \
139 "-p : select OpenCL Platform(s). Multiple allowed (1,2,3,etc.). [Default: all]\n"
140 "-d : select OpenCL Device(s). Multiple allowed (1,2,3,etc.). [Default: all]\n"
141 "-D : select OpenCL Device Type. 0: GPU, 1: CPU, 2: all. [Default: GPU]\n"
142 "-S : select the thread scheduler type. 0: sequential, 1: asynchronous. [Default 1]\n"
143 "-P : select the Profile, from 0 to 10. [Default: auto-tuning]\n"
144 "-F : force verify key with OpenCL instead of CPU. [Default: disabled]\n"
145 "-Q : select queue engine. 0: forward, 1: reverse, 2: random. [Default: 0]\n"
146 "-s : show the list of OpenCL platforms/devices, then exit\n"
147 "-V : enable debug messages\n"
148 "-v : show the version\n"
149 "-h : show this help\n\n", name
);
151 printf("Example, select devices 1, 2 and 3 using platform 1 and 2, with random queue engine:\n\n"
152 "%s -D 2 -Q 2 -p 1,2 -d 1,2,3 2ab12bf2 4B71E49D 6A606453 D79BD94B 16A2255B\n\n", name
);
157 static bool parse_arg(char *restrict in
, unsigned int *out
, unsigned int *out_cnt
, const int opt_type
) {
158 unsigned int cnt
= 0;
164 if (strchr(in
, ',')) {
166 char *saveptr
= NULL
;
167 char *next
= strtok_r(in
, ",", &saveptr
);
170 unsigned int tmp_sel
= (unsigned int) strtoul(next
, NULL
, 10);
171 if (errno
== EINVAL
|| errno
== ERANGE
||
172 (tmp_sel
< 1 || tmp_sel
> 16)) {
173 printf("Error: invalid %s argument\n", (opt_type
== 0) ? "'platform'" : "'device'");
177 out
[cnt
++] = tmp_sel
;
179 } while ((next
= strtok_r(NULL
, ",", &saveptr
)) != NULL
);
185 out
[0] = (unsigned int) strtoul(in
, NULL
, 10);
186 if (errno
== EINVAL
|| errno
== ERANGE
) {
187 printf("Error: invalid %s argument\n", (opt_type
== 0) ? "'platform'" : "'device'");
197 int main(int argc
, char **argv
) {
200 uint32_t uid
= 0, nR1
= 0, aR1
= 0, nR2
= 0, aR2
= 0;
201 bool opencl_profiling
= 0;
202 bool force_hitag2_opencl
= false;
203 bool verbose
= false;
206 char *platforms_selected
= NULL
;
207 char *devices_selected
= NULL
;
208 unsigned int device_types_selected
= 0;
209 unsigned int thread_scheduler_type_selected
= THREAD_TYPE_ASYNC
;
210 unsigned int profile_selected
= 2;
211 unsigned int queue_type
= 0;
213 uint32_t **matches_found
= NULL
;
214 uint64_t **matches
= NULL
;
218 while ((opt
= getopt(argc
, argv
, "p:d:D:S:P:F:Q:svVh")) != -1) {
222 platforms_selected
= strdup(optarg
);
226 devices_selected
= strdup(optarg
);
229 // 0: gpu, 1: cpu, 2: all
230 device_types_selected
= (unsigned int) strtoul(optarg
, NULL
, 10);
231 if (device_types_selected
> 2) {
232 printf("Error: invalid DEVICE TYPE argument (accepted values: from 0 to 2)\n");
237 // 0: sequential, 1: async
238 thread_scheduler_type_selected
= (unsigned int) strtoul(optarg
, NULL
, 10);
241 profile_selected
= (unsigned int) strtoul(optarg
, NULL
, 10);
242 if (profile_selected
> 10) {
243 printf("Error: invalid PROFILE argument (accepted valuee: from 0 to 10)\n");
248 force_hitag2_opencl
= true;
251 // 0: forward, 1: reverse, 2: random
252 queue_type
= (unsigned int) strtoul(optarg
, NULL
, 10);
253 if (queue_type
!= QUEUE_TYPE_FORWARD
&& queue_type
!= QUEUE_TYPE_REVERSE
&& queue_type
!= QUEUE_TYPE_RANDOM
) {
254 printf("Error: invalid QUEUE TYPE argument (accepted values: 0, 1 or 2)\n");
265 printf("Version: %s\n", VERSION
);
273 unsigned int plat_sel
[16] = { 0 };
274 unsigned int plat_cnt
= 0;
275 unsigned int dev_sel
[16] = { 0 };
276 unsigned int dev_cnt
= 0;
278 if (!parse_arg(platforms_selected
, plat_sel
, &plat_cnt
, 0)) {
279 free(platforms_selected
);
283 if (!parse_arg(devices_selected
, dev_sel
, &dev_cnt
, 1)) {
284 free(platforms_selected
);
285 free(devices_selected
);
289 free(platforms_selected
);
290 free(devices_selected
);
292 if (device_types_selected
== 0) {
293 device_types_selected
= CL_DEVICE_TYPE_GPU
;
294 } else if (device_types_selected
== 1) {
295 device_types_selected
= CL_DEVICE_TYPE_CPU
;
297 device_types_selected
= CL_DEVICE_TYPE_ALL
;
303 device_types_selected
= CL_DEVICE_TYPE_ALL
;
307 if (plat_sel
[0] == 0xff) {
308 printf("Platforms selected : ALL\n");
310 printf("Platforms selected : %u", plat_sel
[0]);
311 for (unsigned int i
= 1; i
< plat_cnt
; i
++) {
312 printf(", %u", plat_sel
[i
]);
317 if (dev_sel
[0] == 0xff) {
318 printf("Devices selected : ALL\n");
320 printf("Devices selected : %u", dev_sel
[0]);
321 for (unsigned int i
= 1; i
< dev_cnt
; i
++) {
322 printf(", %u", dev_sel
[i
]);
327 printf("Device types selected : %s\n", (device_types_selected
== CL_DEVICE_TYPE_GPU
) ? "GPU" : (device_types_selected
== CL_DEVICE_TYPE_CPU
) ? "CPU" : "ALL");
328 printf("Scheduler selected : %s\n", (thread_scheduler_type_selected
== 0) ? "sequential" : "async");
329 printf("Profile selected : %u\n", profile_selected
);
334 if ((argc
- optind
) < 5) {
336 printf("Error: invalid extra arguments\n");
341 for (int e
= 0; e
< 5; optind
++, e
++) {
344 if (!strncmp(argv
[optind
], "0x", 2) || !strncmp(argv
[optind
], "0X", 2)) {
345 if (strlen(argv
[optind
]) != 2 + 8) {
346 printf("Error: invalid UID length\n");
349 uid
= (uint32_t) rev32(hexreversetoulong(argv
[optind
] + 2));
351 if (strlen(argv
[optind
]) != 8) {
352 printf("Error: invalid UID length\n");
355 uid
= (uint32_t) rev32(hexreversetoulong(argv
[optind
]));
360 if (!strncmp(argv
[optind
], "0x", 2) || !strncmp(argv
[optind
], "0X", 2)) {
361 if (strlen(argv
[optind
]) != 2 + 8) {
362 printf("Error: invalid nR1 length\n");
365 nR1
= (uint32_t) rev32(hexreversetoulong(argv
[optind
] + 2));
367 if (strlen(argv
[optind
]) != 8) {
368 printf("Error: invalid nR1 length\n");
371 nR1
= (uint32_t) rev32(hexreversetoulong(argv
[optind
]));
376 if (strlen(argv
[optind
]) != 8) {
377 printf("Error: invalid aR1 length\n");
380 aR1
= (uint32_t) strtoul(argv
[optind
], NULL
, 16);
384 if (!strncmp(argv
[optind
], "0x", 2) || !strncmp(argv
[optind
], "0X", 2)) {
385 if (strlen(argv
[optind
]) != 2 + 8) {
386 printf("Error: invalid nR2 length\n");
389 nR2
= (uint32_t) rev32(hexreversetoulong(argv
[optind
] + 2));
391 if (strlen(argv
[optind
]) != 8) {
392 printf("Error: invalid nR2 length\n");
395 nR2
= (uint32_t) rev32(hexreversetoulong(argv
[optind
]));
400 if (strlen(argv
[optind
]) != 8) {
401 printf("Error: invalid aR2 length\n");
404 aR2
= (uint32_t) strtoul(argv
[optind
], NULL
, 16);
407 default: // skip invalid instead of show usage and exit
414 memset(&ctx
, 0, sizeof(opencl_ctx_t
));
415 memset(keystream
, 0, sizeof(keystream
));
416 memset(candidates
, 0, sizeof(candidates
));
418 ctx
.profiling
= opencl_profiling
;
419 ctx
.thread_sched_type
= (short) thread_scheduler_type_selected
;
420 ctx
.force_hitag2_opencl
= force_hitag2_opencl
;
422 uint32_t checks
[4] = { uid
, aR2
, nR1
, nR2
};
426 printf("uid: %u, aR2: %u, nR1: %u, nR2: %u\n", checks
[0], checks
[1], checks
[2], checks
[3]);
429 uint32_t target
= ~aR1
;
430 // bitslice inverse target bits
431 bitslice(~target
, keystream
);
433 size_t layer_0_found
= 0;
435 // compute layer 0 output
436 for (size_t i0
= 0; i0
< 1 << 20; i0
++) {
437 uint64_t state0
= expand(0x5806b4a2d16c, i0
);
439 if (f(state0
) == target
>> 31) {
441 // candidates[layer_0_found++] = state0;
443 // cf kernel, state is now split in 3 shorts >> 2
444 candidates
[(layer_0_found
* 3) + 0] = (uint16_t)((state0
>> (32 + 2)) & 0xffff);
445 candidates
[(layer_0_found
* 3) + 1] = (uint16_t)((state0
>> (16 + 2)) & 0xffff);
446 candidates
[(layer_0_found
* 3) + 2] = (uint16_t)((state0
>> (0 + 2)) & 0xffff);
452 printf("[debug] layer_0_found: %zu\n", layer_0_found
);
456 // powered by dolphin's macros :)
459 // todo, calculate the max number of allocations to remove 0x40
460 void **freeList
= (void **) calloc(1, 0x40 * sizeof(void *));
462 printf("Error: calloc (freeList) failed (%d): %s\n", errno
, strerror(errno
));
467 // load OpenCL kernel source
469 const char *opencl_kernel
= "ht2crack5opencl_kernel.cl";
471 int fd
= open(opencl_kernel
, O_RDONLY
);
473 printf("Error: open (%s) failed (%d): %s\n", opencl_kernel
, errno
, strerror(errno
));
477 if (fstat(fd
, &st
)) {
478 printf("Error: stat (%s) failed (%d): %s\n", opencl_kernel
, errno
, strerror(errno
));
483 ctx
.kernelSource_len
= (size_t) st
.st_size
;
484 ctx
.kernelSource
[0] = (char *) calloc(ctx
.kernelSource_len
+ 1, sizeof(char)); // size + \0
485 if (!ctx
.kernelSource
[0]) {
486 printf("Error: calloc (ctx.kernelSource[0]) failed (%d): %s\n", errno
, strerror(errno
));
490 MEMORY_FREE_ADD(ctx
.kernelSource
[0])
492 if (read(fd
, ctx
.kernelSource
[0], ctx
.kernelSource_len
) < (ssize_t
) ctx
.kernelSource_len
) {
493 printf("Error: read (%s) failed (%d): %s\n", opencl_kernel
, errno
, strerror(errno
));
499 ctx
.kernelSource
[0][ctx
.kernelSource_len
] = '\0';
505 cl_uint ocl_platform_cnt
= 0;
506 size_t selected_platforms_cnt
= 0;
507 size_t selected_devices_cnt
= 0;
508 compute_platform_ctx_t
*cd_ctx
= NULL
;
514 // now discover and set up compute device(s)
515 if ((err
= discoverDevices(profile_selected
, device_types_selected
, &ocl_platform_cnt
, &selected_platforms_cnt
, &selected_devices_cnt
, &cd_ctx
, plat_sel
, plat_cnt
, dev_sel
, dev_cnt
, verbose
, show
)) != 0) {
516 printf("Error: discoverDevices() failed\n");
528 // new selection engine, need to support multi-gpu system (with the same platform)
534 MEMORY_FREE_ADD(cd_ctx
)
536 if (selected_platforms_cnt
== 0) {
537 printf("! No platform was selected ...\n");
542 if (selected_devices_cnt
== 0) {
543 printf("! No device(s) was selected ...\n");
548 size_t w
= 0, q
= 0, g
= 0;
550 size_t z
= 0; // z is a dolphin's friend
552 // show selected devices
554 printf("Selected %zu OpenCL Device(s)\n\n", selected_devices_cnt
);
556 for (w
= 0; w
< ocl_platform_cnt
; w
++) {
557 if (!cd_ctx
[w
].selected
) {
561 for (q
= 0; q
< cd_ctx
[w
].device_cnt
; q
++) {
562 if (!cd_ctx
[w
].device
[q
].selected
) {
566 if (cd_ctx
[w
].is_apple
&& !strncmp(cd_ctx
[w
].device
[q
].vendor
, "Intel", 5)) {
567 // disable hitag2 with apple platform and not apple device vendor (< Apple M1)
568 ctx
.force_hitag2_opencl
= false;
571 printf("%2zu - %s", z
, cd_ctx
[w
].device
[q
].name
);
573 printf(" (Lop3 %s, ", (cd_ctx
[w
].device
[q
].have_lop3
) ? "yes" : "no");
574 printf("Local Memory %s)", (cd_ctx
[w
].device
[q
].have_local_memory
) ? "yes" : "no");
583 if (selected_devices_cnt
!= z
) {
584 printf("BUG: z and selected_devices_cnt are not equal\n");
589 // time to eat some memory :P
591 if (!(ctx
.device_ids
= (cl_device_id
*) calloc(selected_devices_cnt
, sizeof(cl_device_id
)))) {
592 printf("Error: calloc (ctx.device_ids) failed (%d): %s\n", errno
, strerror(errno
));
597 MEMORY_FREE_ADD(ctx
.device_ids
)
599 if (!(ctx
.contexts
= (cl_context
*) calloc(selected_devices_cnt
, sizeof(cl_context
)))) {
600 printf("Error: calloc (ctx.contexts) failed (%d): %s\n", errno
, strerror(errno
));
605 MEMORY_FREE_ADD(ctx
.contexts
)
607 if (!(ctx
.commands
= (cl_command_queue
*) calloc(selected_devices_cnt
, sizeof(cl_command_queue
)))) {
608 printf("Error: calloc (ctx.commands) failed (%d): %s\n", errno
, strerror(errno
));
613 MEMORY_FREE_ADD(ctx
.commands
)
615 if (!(ctx
.programs
= (cl_program
*) calloc(selected_devices_cnt
, sizeof(cl_program
)))) {
616 printf("Error: calloc (ctx.programs) failed (%d): %s\n", errno
, strerror(errno
));
621 MEMORY_FREE_ADD(ctx
.programs
)
623 if (!(ctx
.kernels
= (cl_kernel
*) calloc(selected_devices_cnt
, sizeof(cl_kernel
)))) {
624 printf("Error: calloc (ctx.kernels) failed (%d): %s\n", errno
, strerror(errno
));
629 MEMORY_FREE_ADD(ctx
.kernels
)
631 if (!(matches
= (uint64_t **) calloc(selected_devices_cnt
, sizeof(uint64_t *)))) {
632 printf("Error: calloc (**matches) failed (%d): %s\n", errno
, strerror(errno
));
637 MEMORY_FREE_ADD(matches
)
639 if (!(matches_found
= (uint32_t **) calloc(selected_devices_cnt
, sizeof(uint32_t *)))) {
640 printf("Error: calloc (**matches_found) failed (%d): %s\n", errno
, strerror(errno
));
645 MEMORY_FREE_ADD(matches_found
)
647 if (!(ctx
.keystreams
= (cl_mem
*) calloc(selected_devices_cnt
, sizeof(cl_mem
)))) {
648 printf("Error: calloc (ctx.keystreams) failed (%d): %s\n", errno
, strerror(errno
));
653 MEMORY_FREE_ADD(ctx
.keystreams
)
655 if (!(ctx
.candidates
= (cl_mem
*) calloc(selected_devices_cnt
, sizeof(cl_mem
)))) {
656 printf("Error: calloc (ctx.candidates) failed (%d): %s\n", errno
, strerror(errno
));
661 MEMORY_FREE_ADD(ctx
.candidates
)
663 if (!(ctx
.matches
= (cl_mem
*) calloc(selected_devices_cnt
, sizeof(cl_mem
)))) {
664 printf("Error: calloc (ctx.matches) failed (%d): %s\n", errno
, strerror(errno
));
669 MEMORY_FREE_ADD(ctx
.matches
)
671 if (!(ctx
.matches_found
= (cl_mem
*) calloc(selected_devices_cnt
, sizeof(cl_mem
)))) {
672 printf("Error: calloc (ctx.matches_found) failed (%d): %s\n", errno
, strerror(errno
));
677 MEMORY_FREE_ADD(ctx
.matches_found
)
679 if (ctx
.force_hitag2_opencl
) {
680 if (!(ctx
.checks
= (cl_mem
*) calloc(selected_devices_cnt
, sizeof(cl_mem
)))) {
681 printf("Error: calloc (ctx.checks) failed (%d): %s\n", errno
, strerror(errno
));
686 MEMORY_FREE_ADD(ctx
.checks
)
689 if (!(ctx
.global_ws
= (size_t *) calloc(selected_devices_cnt
, sizeof(size_t)))) {
690 printf("Error: calloc (ctx.global_ws) failed (%d): %s\n", errno
, strerror(errno
));
695 MEMORY_FREE_ADD(ctx
.global_ws
)
697 if (!(ctx
.local_ws
= (size_t *) calloc(selected_devices_cnt
, sizeof(size_t)))) {
698 printf("Error: calloc (ctx.local_ws) failed (%d): %s\n", errno
, strerror(errno
));
703 MEMORY_FREE_ADD(ctx
.local_ws
)
705 // show the build log in case of error
706 // todo: only for device models
707 unsigned int build_errors
= 0;
708 // unsigned int build_logs = 0;
710 cl_command_queue_properties queue_properties
= 0;
712 if (opencl_profiling
) {
713 queue_properties
= CL_QUEUE_PROFILING_ENABLE
;
720 for (w
= 0; w
< ocl_platform_cnt
; w
++) {
721 if (!cd_ctx
[w
].selected
) {
725 for (q
= 0; q
< cd_ctx
[w
].device_cnt
; q
++) {
727 if (!cd_ctx
[w
].device
[q
].selected
) {
731 ctx
.device_ids
[z
] = cd_ctx
[w
].device
[q
].device_id
;
733 // create the opencl context with the array
734 ctx
.contexts
[z
] = clCreateContext(NULL
, 1, &ctx
.device_ids
[z
], NULL
, NULL
, &err
);
735 if (!ctx
.contexts
[z
] || err
!= CL_SUCCESS
) {
736 printf("[%zu] Error: clCreateContext() failed (%d)\n", z
, err
);
737 MEMORY_FREE_OPENCL(ctx
, z
)
738 MEMORY_FREE_LIST(matches
, z
)
739 MEMORY_FREE_LIST(matches_found
, z
)
744 // create a command queue for each selected device
745 ctx
.commands
[z
] = clCreateCommandQueue(ctx
.contexts
[z
], ctx
.device_ids
[z
], queue_properties
, &err
);
746 if (!ctx
.commands
[z
] || err
!= CL_SUCCESS
) {
747 printf("[%zu] Error: clCreateCommandQueue() failed (%d)\n", z
, err
);
748 MEMORY_FREE_OPENCL(ctx
, z
)
749 MEMORY_FREE_LIST(matches
, z
)
750 MEMORY_FREE_LIST(matches_found
, z
)
756 // warning: cast from 'char *(*)[1]' to 'const char **' increases required alignment from 1 to 8
757 const char *a
= (const char *)ctx
.kernelSource
[0];
758 const char **ks
= &a
;
760 // create the compute program from the source buffer
761 //ctx.programs[z] = clCreateProgramWithSource(ctx.contexts[z], 1, (const char **) &ctx.kernelSource, &ctx.kernelSource_len, &err);
762 ctx
.programs
[z
] = clCreateProgramWithSource(ctx
.contexts
[z
], 1, ks
, &ctx
.kernelSource_len
, &err
);
763 if (!ctx
.programs
[z
] || err
!= CL_SUCCESS
) {
764 printf("[%zu] Error: clCreateProgramWithSource() failed (%d)\n", z
, err
);
765 MEMORY_FREE_OPENCL(ctx
, z
)
766 MEMORY_FREE_LIST(matches
, z
)
767 MEMORY_FREE_LIST(matches_found
, z
)
772 // build the program executable
773 bool have_local_memory
= false;
775 char build_options
[0x100];
777 memset(build_options
, 0, sizeof(build_options
));
779 strncpy(build_options
, "-Werror", 8);
782 if (cd_ctx
[w
].device
[q
].have_lop3
) { // enable lop3
783 strncpy(build_options
+ blen
, " -D HAVE_LOP3", 14);
787 if (ctx
.force_hitag2_opencl
) {
788 // force using hitag2 key validation with OpenCL
789 strncpy(build_options
+ blen
, " -D WITH_HITAG2_FULL", 21);
793 // Intel's GPUs are worse than Apple's
794 #if APPLE_GPU_BROKEN == 0
795 if (cd_ctx
[w
].device
[q
].is_gpu
&& !strncmp(cd_ctx
[w
].device
[q
].vendor
, "Intel", 5)) {
796 if (cd_ctx
[w
].is_apple
|| cd_ctx
[w
].is_intel
) {
797 strncpy(build_options
+ blen
, " -D LOWPERF", 13);
804 printf("[debug] Device %zu have local mem ? %d\n", z
, cd_ctx
[w
].device
[q
].have_local_memory
);
807 if (cd_ctx
[w
].device
[q
].have_local_memory
) { // kernel keystream memory optimization
808 have_local_memory
= true;
809 strncpy(build_options
+ blen
, " -D HAVE_LOCAL_MEMORY", 22);
814 printf("[%zu] Building OpenCL program with options (len %zu): %s\n", z
, blen
, build_options
);
817 err
= clBuildProgram(ctx
.programs
[z
], 1, &ctx
.device_ids
[z
], build_options
, NULL
, NULL
);
820 if (err
!= CL_SUCCESS
)
824 if (err
!= CL_SUCCESS
)
827 printf("[%zu] Error: clBuildProgram() failed (%d)\n", z
, err
);
831 // todo: if same device model of other and build_logs > 0, continue
834 err
= clGetProgramBuildInfo(ctx
.programs
[z
], cd_ctx
[w
].device
[q
].device_id
, CL_PROGRAM_BUILD_LOG
, 0, NULL
, &len
);
835 if (err
!= CL_SUCCESS
) {
836 printf("[%zu] Error: clGetProgramBuildInfo failed (%d)\n", z
, err
);
844 if (len
> 0xdeadbe) {
845 len
= 0xdeadbe; // limit build_log size
848 char *buffer
= (char *) calloc(len
, sizeof(char));
850 printf("[%zu] Error: calloc (CL_PROGRAM_BUILD_LOG) failed (%d): %s\n", z
, errno
, strerror(errno
));
854 err
= clGetProgramBuildInfo(ctx
.programs
[z
], cd_ctx
[w
].device
[q
].device_id
, CL_PROGRAM_BUILD_LOG
, len
, buffer
, 0);
855 if (err
!= CL_SUCCESS
) {
856 printf("[%zu] clGetProgramBuildInfo() failed (%d)\n", z
, err
);
865 printf("[%zu] Build log (len %zu):\n--------\n%s\n--------\n", z
, len
, buffer
);
872 continue; // todo: evaluate this, one or more can be broken, so continue
876 // todo, continue if build_errors
878 // Create the compute kernel in the program we wish to run
879 ctx
.kernels
[z
] = clCreateKernel(ctx
.programs
[z
], "find_state", &err
);
880 if (!ctx
.kernels
[z
] || err
!= CL_SUCCESS
) {
881 printf("[%zu] Error: clCreateKernel() failed (%d)\n", z
, err
);
882 MEMORY_FREE_OPENCL(ctx
, z
)
883 MEMORY_FREE_LIST(matches
, z
)
884 MEMORY_FREE_LIST(matches_found
, z
)
890 err
= clGetKernelWorkGroupInfo(ctx
.kernels
[z
], cd_ctx
[w
].device
[q
].device_id
, CL_KERNEL_WORK_GROUP_SIZE
, sizeof(size_t), &wgs
, NULL
);
891 if (err
!= CL_SUCCESS
) {
892 printf("[%zu] Error: clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE) failed (%d)\n", z
, err
);
893 MEMORY_FREE_OPENCL(ctx
, z
)
894 // if macros work, next 2 macro are not needed
895 MEMORY_FREE_LIST(matches
, z
)
896 MEMORY_FREE_LIST(matches_found
, z
)
901 ctx
.local_ws
[z
] = wgs
;
904 if (ctx
.local_ws
[z
] < 32 && have_local_memory
) {
905 printf("Warning: local work-item size is less than the length of the keystream, and local memory optimization is enabled. An unexpected result could arise.\n");
912 // setup, phase 2 (select lower profile)
913 unsigned int profile
= get_smallest_profile(cd_ctx
, ocl_platform_cnt
);
915 // setup, phase 3 (finish him)
917 // z is device counter, dolphin buggy counter as well
921 for (w
= 0; w
< ocl_platform_cnt
; w
++) {
923 if (!cd_ctx
[w
].selected
) {
927 for (q
= 0; q
< cd_ctx
[w
].device_cnt
; q
++) {
929 if (!cd_ctx
[w
].device
[q
].selected
) {
933 ctx
.global_ws
[z
] = (1 << profiles
[profile
][1]);
935 // the following happens with cpu devices or Apple GPU
936 if (ctx
.local_ws
[z
] > 256) {
938 if (cd_ctx
[w
].is_apple
) {
939 ctx
.local_ws
[z
] = 256;
940 } else if (!cd_ctx
[w
].device
[q
].is_gpu
) {
941 ctx
.local_ws
[z
] = 256;
945 // don't allow gws < lws
946 if (ctx
.global_ws
[z
] < ctx
.local_ws
[z
]) {
947 ctx
.local_ws
[z
] = ctx
.global_ws
[z
];
950 if (opencl_profiling
) {
951 printf("[%zu] global_ws %zu, local_ws %zu\n", g
, ctx
.global_ws
[z
], ctx
.local_ws
[z
]);
954 if (!ctx
.force_hitag2_opencl
) {
956 if (!(matches
[z
] = (uint64_t *) calloc((uint32_t)(ctx
.global_ws
[z
] * WGS_MATCHES_FACTOR
), sizeof(uint64_t)))) {
957 printf("[%zu] Error: calloc (matches) failed (%d): %s\n", g
, errno
, strerror(errno
));
958 MEMORY_FREE_OPENCL(ctx
, z
)
959 MEMORY_FREE_LIST(matches
, z
)
960 MEMORY_FREE_LIST(matches_found
, z
)
967 if (!(matches
[z
] = (uint64_t *) calloc(1, sizeof(uint64_t)))) {
968 printf("[%zu] Error: calloc (matches) failed (%d): %s\n", z
, errno
, strerror(errno
));
969 MEMORY_FREE_OPENCL(ctx
, z
)
970 MEMORY_FREE_LIST(matches
, z
)
971 MEMORY_FREE_LIST(matches_found
, z
)
977 if (!(matches_found
[z
] = (uint32_t *) calloc(1, sizeof(uint32_t)))) {
978 printf("[%zu] Error: calloc (matches_found) failed (%d): %s\n", z
, errno
, strerror(errno
));
979 MEMORY_FREE_OPENCL(ctx
, z
)
980 MEMORY_FREE_LIST_Z(matches
, z
)
981 MEMORY_FREE_LIST(matches_found
, z
)
986 ctx
.candidates
[z
] = clCreateBuffer(ctx
.contexts
[z
], CL_MEM_READ_ONLY
, sizeof(uint16_t) * ((1 << 20) * 3), NULL
, NULL
);
987 //ctx.candidates = clCreateBuffer(ctx.contexts[z], CL_MEM_READ_ONLY, sizeof(uint64_t) * ((1 << 20)), NULL, NULL);
988 ctx
.keystreams
[z
] = clCreateBuffer(ctx
.contexts
[z
], CL_MEM_READ_ONLY
, VECTOR_SIZE
* 32, NULL
, NULL
);
990 if (!ctx
.force_hitag2_opencl
) {
991 ctx
.matches
[z
] = clCreateBuffer(ctx
.contexts
[z
], CL_MEM_WRITE_ONLY
, sizeof(uint64_t) * (uint32_t)(ctx
.global_ws
[z
] * WGS_MATCHES_FACTOR
), NULL
, NULL
);
993 ctx
.matches
[z
] = clCreateBuffer(ctx
.contexts
[z
], CL_MEM_WRITE_ONLY
, sizeof(uint64_t), NULL
, NULL
);
996 ctx
.matches_found
[z
] = clCreateBuffer(ctx
.contexts
[z
], CL_MEM_READ_WRITE
, sizeof(uint32_t), NULL
, NULL
);
998 if (ctx
.force_hitag2_opencl
) {
999 ctx
.checks
[z
] = clCreateBuffer(ctx
.contexts
[z
], CL_MEM_READ_ONLY
, sizeof(uint32_t) * 4, NULL
, NULL
);
1000 if (!ctx
.checks
[z
]) {
1001 printf("[%zu] Error: invalid shared cl_mem (ctx.candidates|ctx.keystream|ctx.checks)\n", z
);
1002 MEMORY_FREE_OPENCL(ctx
, z
)
1003 MEMORY_FREE_LIST_Z(matches
, z
)
1004 MEMORY_FREE_LIST_Z(matches_found
, z
)
1010 if (!ctx
.candidates
[z
] || !ctx
.keystreams
[z
]) {
1011 printf("[%zu] Error: invalid shared cl_mem (ctx.candidates|ctx.keystream)\n", z
);
1012 MEMORY_FREE_OPENCL(ctx
, z
)
1013 MEMORY_FREE_LIST_Z(matches
, z
)
1014 MEMORY_FREE_LIST_Z(matches_found
, z
)
1019 if (!ctx
.matches
[z
] || !ctx
.matches_found
[z
]) {
1020 printf("[%zu] Error: invalid per-device cl_mem (ctx.matches or ctx.matches_found)\n", z
);
1021 MEMORY_FREE_OPENCL(ctx
, z
)
1022 MEMORY_FREE_LIST_Z(matches
, z
)
1023 MEMORY_FREE_LIST_Z(matches_found
, z
)
1028 // Write our data set into the input array in device memory
1030 // if z is last set CL_TRUE (blocking) else CL_FALSE (non-blocking)
1031 // using this way, setup time can be reduced
1032 err
= clEnqueueWriteBuffer(ctx
.commands
[z
], ctx
.keystreams
[z
], CL_TRUE
, 0, VECTOR_SIZE
* 32, keystream
, 0, NULL
, NULL
);
1033 if (err
!= CL_SUCCESS
) {
1034 printf("[%zu] Error: clEnqueueWriteBuffer(ctx.keystream) failed (%d)\n", z
, err
);
1035 MEMORY_FREE_OPENCL(ctx
, z
)
1036 MEMORY_FREE_LIST_Z(matches
, z
)
1037 MEMORY_FREE_LIST_Z(matches_found
, z
)
1042 err
= clEnqueueWriteBuffer(ctx
.commands
[z
], ctx
.candidates
[z
], CL_TRUE
, 0, sizeof(uint16_t) * ((1 << 20) * 3), candidates
, 0, NULL
, NULL
);
1043 // err = clEnqueueWriteBuffer(ctx.commands[z], ctx.candidates, CL_TRUE, 0, sizeof(uint64_t) * ((1 << 20)), candidates, 0, NULL, NULL);
1044 if (err
!= CL_SUCCESS
) {
1045 printf("[%zu] Error: clEnqueueWriteBuffer(ctx.candidates) failed (%d)\n", z
, err
);
1046 MEMORY_FREE_OPENCL(ctx
, z
)
1047 MEMORY_FREE_LIST_Z(matches
, z
)
1048 MEMORY_FREE_LIST_Z(matches_found
, z
)
1053 if (ctx
.force_hitag2_opencl
) {
1054 err
= clEnqueueWriteBuffer(ctx
.commands
[z
], ctx
.checks
[z
], CL_TRUE
, 0, sizeof(uint32_t) * 4, checks
, 0, NULL
, NULL
);
1055 if (err
!= CL_SUCCESS
) {
1056 printf("[%zu] Error: clEnqueueWriteBuffer(ctx.checks) failed (%d)\n", z
, err
);
1057 MEMORY_FREE_OPENCL(ctx
, z
)
1058 MEMORY_FREE_LIST_Z(matches
, z
)
1059 MEMORY_FREE_LIST_Z(matches_found
, z
)
1065 // Set the arguments to our compute kernel
1066 err
= clSetKernelArg(ctx
.kernels
[z
], 1, sizeof(cl_mem
), &ctx
.candidates
[z
]);
1067 err
|= clSetKernelArg(ctx
.kernels
[z
], 2, sizeof(cl_mem
), &ctx
.keystreams
[z
]);
1068 err
|= clSetKernelArg(ctx
.kernels
[z
], 3, sizeof(cl_mem
), &ctx
.matches
[z
]);
1069 if (ctx
.force_hitag2_opencl
) err
|= clSetKernelArg(ctx
.kernels
[z
], 5, sizeof(cl_mem
), &ctx
.checks
[z
]);
1071 if (err
!= CL_SUCCESS
) {
1072 printf("[%zu] Error: clSetKernelArg(ctx.candidates|ctx.keystream|ctx.matches|ctx.checks) failed (%d)\n", z
, err
);
1073 MEMORY_FREE_OPENCL(ctx
, z
)
1074 MEMORY_FREE_LIST_Z(matches
, z
)
1075 MEMORY_FREE_LIST_Z(matches_found
, z
)
1084 if (build_errors
> 0) {
1086 printf("[debug] Detected build errors with %u device(s).\n", build_errors
);
1088 MEMORY_FREE_OPENCL(ctx
, z
)
1089 MEMORY_FREE_LIST_Z(matches
, z
)
1090 MEMORY_FREE_LIST_Z(matches_found
, z
)
1095 // at this point z is the max value, still useful for the frees
1098 printf("[debug] Lower profile between %zu device(s) is: %d\n", selected_devices_cnt
, profile
);
1101 uint32_t max_step
= profiles
[profile
][0];
1102 uint32_t chunk
= profiles
[profile
][1];
1105 printf("[debug] Init queue\n");
1109 if ((ret
= wu_queue_init(&ctx
.queue_ctx
, queue_type
)) != 0) {
1110 printf("! Error: wu_queue_init(%s) failed (%d): %s\n", wu_queue_strdesc(queue_type
), ret
, wu_queue_strerror(ret
));
1111 MEMORY_FREE_OPENCL(ctx
, z
)
1112 MEMORY_FREE_LIST_Z(matches
, z
)
1113 MEMORY_FREE_LIST_Z(matches_found
, z
)
1119 printf("[queue] Fill queue with pre-calculated offset using profile (%d): ", profile
);
1122 for (size_t step
= 0; step
< max_step
; step
++) {
1123 wu_queue_push(&ctx
.queue_ctx
, step
, step
<< chunk
, max_step
);
1130 // save selected_devices_cnt
1131 size_t thread_count
= selected_devices_cnt
;
1133 thread_ctx_t th_ctx
;
1134 memset(&th_ctx
, 0, sizeof(thread_ctx_t
));
1136 thread_args_t
*t_arg
= (thread_args_t
*) calloc(thread_count
, sizeof(thread_args_t
));
1138 printf("Error: calloc (thread_args_t) failed (%d): %s\n", errno
, strerror(errno
));
1139 MEMORY_FREE_OPENCL(ctx
, z
)
1140 MEMORY_FREE_LIST_Z(matches
, z
)
1141 MEMORY_FREE_LIST_Z(matches_found
, z
)
1146 MEMORY_FREE_ADD(t_arg
)
1148 if ((ret
= thread_init(&th_ctx
, ctx
.thread_sched_type
, thread_count
)) != 0) {
1149 printf("Error: thread_init(%zu) failed (%d)\n", thread_count
, ret
);
1150 MEMORY_FREE_OPENCL(ctx
, z
)
1151 MEMORY_FREE_LIST_Z(matches
, z
)
1152 MEMORY_FREE_LIST_Z(matches_found
, z
)
1157 // preload constant values in threads memory, and start threads
1158 for (z
= 0; z
< thread_count
; z
++) {
1163 t_arg
[z
].max_slices
= max_step
;
1164 t_arg
[z
].ocl_ctx
= &ctx
;
1165 t_arg
[z
].device_id
= z
;
1166 t_arg
[z
].thread_ctx
= &th_ctx
;
1168 t_arg
[z
].matches
= matches
[z
];
1169 t_arg
[z
].matches_found
= matches_found
[z
];
1170 t_arg
[z
].status
= TH_START
;
1173 if (ctx
.thread_sched_type
== THREAD_TYPE_ASYNC
) {
1174 if ((ret
= thread_start(&th_ctx
, t_arg
)) != 0) {
1175 printf("Error: thread_start() failed (%d): %s\n", ret
, thread_strerror(ret
));
1176 thread_destroy(&th_ctx
);
1177 MEMORY_FREE_OPENCL(ctx
, z
)
1178 MEMORY_FREE_LIST_Z(matches
, z
)
1179 MEMORY_FREE_LIST_Z(matches_found
, z
)
1186 // they now are all in TH_WAIT locked by a cond_wait
1187 // try the normal routine
1188 if (ctx
.thread_sched_type
== THREAD_TYPE_ASYNC
) {
1189 size_t th_status_err
= 0;
1190 for (z
= 0; z
< thread_count
; z
++) {
1191 pthread_mutex_lock(&th_ctx
.thread_mutexs
[z
]);
1192 thread_status_t tmp
= t_arg
[z
].status
;
1193 pthread_mutex_unlock(&th_ctx
.thread_mutexs
[z
]);
1195 if (tmp
!= TH_START
) {
1196 printf("! Warning: Thread %zu is not in TH_START, found in %s\n", z
, thread_status_strdesc(tmp
));
1201 if (th_status_err
!= 0) {
1202 printf("! Warning: %zu thread(s) found in wrong initial state ...\n", th_status_err
);
1204 printf("# %zu thread(s) ready\n", thread_count
);
1207 #endif // DEBUGME >= 1
1211 bool show_overall_time
= true;
1213 struct timeval cpu_t_start
, cpu_t_end
, cpu_t_result
;
1215 printf("Attack 5 - opencl - start (Max Slices %u, %s order", max_step
, wu_queue_strdesc(ctx
.queue_ctx
.queue_type
));
1220 printf(", Profile %u, Async Threads %s, HiTag2 key verify on device %s)\n\n"
1222 , (ctx
.thread_sched_type
== THREAD_TYPE_ASYNC
) ? "yes" : "no"
1223 , (force_hitag2_opencl
) ? "yes" : "no"
1227 if (gettimeofday(&cpu_t_start
, NULL
) == -1) {
1228 printf("Error: gettimeofday(start) failed (%d): %s\n", errno
, strerror(errno
));
1229 show_overall_time
= false;
1232 // Hokuto Hyakuretsu Ken
1233 ret
= thread_start_scheduler(&th_ctx
, t_arg
, &ctx
.queue_ctx
);
1235 printf("Error: thread_start_scheduler() failed (%d): %s\n", ret
, thread_strerror(ret
));
1237 } else if (ret
== 0) {
1241 // if found, show the key here
1242 for (size_t y
= 0; y
< thread_count
; y
++) {
1250 if (thread_count
> 1) {
1251 printf("[%zu] ", y
);
1254 printf("\nKey found @ slice %zu/%zu [ \x1b[32m"
1256 , t_arg
[y
].max_slices
1259 for (int i
= 0; i
< 6; i
++) {
1260 printf("%02X", (uint8_t)(t_arg
[y
].key
& 0xff));
1261 t_arg
[y
].key
= t_arg
[y
].key
>> 8;
1263 printf(AEND
" ]\n");
1269 if (show_overall_time
) {
1270 if (gettimeofday(&cpu_t_end
, NULL
) == 0) {
1271 timersub(&cpu_t_end
, &cpu_t_start
, &cpu_t_result
);
1273 printf("Error. gettimeofday(end) failed (%d): %s\n", errno
, strerror(errno
));
1274 show_overall_time
= false;
1278 if (found
== false) {
1280 printf("\nSomething went wrong ( " _RED_("fail") " )\n");
1282 printf("\nExhausted keyspace ( " _RED_("fail") " )\n");
1286 printf("\nAttack 5 - opencl - end");
1288 if (show_overall_time
) {
1289 printf(" in " _YELLOW_("%ld.%2ld") " second(s).\n\n", (long int)cpu_t_result
.tv_sec
, (long int)cpu_t_result
.tv_usec
);
1297 printf("stop threads\n");
1301 if (error
== false && th_ctx
.type
!= THREAD_TYPE_SEQ
) {
1302 thread_stop(&th_ctx
);
1306 printf("destroy threads\n");
1310 if (error
== false) {
1311 if ((ret
= thread_destroy(&th_ctx
)) != 0) {
1313 printf("Warning: thread_destroy() failed (%d): %s\n", ret
, thread_strerror(ret
));
1319 printf("wu_queue_destroy\n");
1323 if ((ret
= wu_queue_destroy(&ctx
.queue_ctx
)) != 0) {
1325 printf("Warning: wu_queue_destroy() failed (%d): %s\n", ret
, wu_queue_strerror(ret
));
1329 z
= selected_devices_cnt
- 1;
1330 MEMORY_FREE_OPENCL(ctx
, z
)
1331 MEMORY_FREE_LIST_Z(matches
, z
)
1332 MEMORY_FREE_LIST_Z(matches_found
, z
)
1335 return (found
) ? 0 : 1;