sq epan/dissectors/pidl/rcg/rcg.cnf
[wireshark-sm.git] / tools / json2pcap / json2pcap.py
blobbaa64b64585dc217b3935f05def4caada184a570
1 #!/usr/bin/env python3
4 # Copyright 2020, Martin Kacer <kacer.martin[AT]gmail.com> and contributors
6 # Wireshark - Network traffic analyzer
7 # By Gerald Combs <gerald@wireshark.org>
8 # Copyright 1998 Gerald Combs
10 # SPDX-License-Identifier: GPL-2.0-or-later
12 import sys
13 import ijson
14 import operator
15 import copy
16 import binascii
17 import array
18 import argparse
19 import string
20 import random
21 import math
22 import hashlib
23 import re
24 from collections import OrderedDict
25 from scapy import all as scapy
27 # Field anonymization class
class AnonymizedField:
    '''
    Describes how one raw field is masked or anonymized.

    :field arg: field name, optionally followed by a Python-style slice such
                as "ip.src_raw[2:]" selecting the part of the value to rewrite
    :type arg: anonymization type [0 masking 0xff, 1 anonymization shake_256]
    :start arg: If specified, the anonymization starts at this offset
    :end arg: If specified, the anonymization ends before this offset

    The offsets index the hex string of the field (two characters per byte),
    because that is what anonymize_field() slices.
    '''
    def __init__(self, field, type):
        self.field = field
        self.type = type
        self.start = None
        self.end = None

        # Optional "[start:end]" suffix; either bound may be omitted or negative.
        match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field)
        if match:
            self.field = match.group(1)
            start, end = match.group(2), match.group(3)
            self.start = int(start) if start is not None else None
            self.end = int(end) if end is not None else None

    # Returns the new field value after anonymization
    def anonymize_field_shake256(self, field, type, salt):
        '''
        Return a SHAKE-256 based replacement with the same length as *field*.

        :param field: hex string to replace
        :param type: field type code; 26, 27 and 28 are handled as strings
        :param salt: text mixed into the hash input
        :return: hex string of exactly len(field) characters
        '''
        shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8'))

        # String type, output should be ASCII
        if type in [26, 27, 28]:
            # Each digest byte gives two hex digits and every digit is stored
            # as the hex of its ASCII code, so one digest byte covers four
            # output characters.
            length = math.ceil(len(field) / 4)
            ret_string = shake.hexdigest(length).encode('ascii').hex()
        # Other types, output could be HEX
        else:
            length = math.ceil(len(field) / 2)
            ret_string = shake.hexdigest(length)

        # The digest is sized with ceil(), so it is never shorter than the
        # field; only truncation is needed to correct the length.
        return ret_string[:len(field)]

    def anonymize_field(self, _h, _t, salt):
        '''
        Apply the configured masking/anonymization to the hex string *_h*.

        :param _h: original hex string of the field
        :param _t: field type code, forwarded to anonymize_field_shake256()
        :param salt: salt forwarded to anonymize_field_shake256()
        :return: [new_hex, mask] where mask has 'f' for every rewritten
                 character and '0' elsewhere; both have the length of _h
        '''
        # Normalise the bounds exactly like Python slicing does. This keeps an
        # explicit end of 0 ("[:0]" selects nothing) and clamps out-of-range
        # or inverted bounds, so head + body + tail always rebuilds a string
        # of the original length.
        s, e, _ = slice(self.start, self.end).indices(len(_h))
        e = max(s, e)

        head, h, tail = _h[:s], _h[s:e], _h[e:]
        if self.type == 0:
            h = 'f' * len(h)
        elif self.type == 1:
            h = self.anonymize_field_shake256(h, _t, salt)

        h_mask = '0' * len(head) + 'f' * len(h) + '0' * len(tail)
        return [head + h + tail, h_mask]
def make_unique(key, dct):
    '''
    Return *key* if it is not present in *dct*, otherwise the first of
    key_1, key_2, ... that is not present.
    '''
    if key not in dct:
        return key

    suffix = 1
    candidate = '{}_{}'.format(key, suffix)
    while candidate in dct:
        suffix += 1
        candidate = '{}_{}'.format(key, suffix)
    return candidate
def parse_object_pairs(pairs):
    '''
    Build an OrderedDict from (key, value) pairs without losing duplicates.

    A key that is already present is stored under the name returned by
    make_unique() instead of overwriting the earlier entry.
    '''
    result = OrderedDict()
    for name, value in pairs:
        # make_unique() returns the name unchanged when it is not taken yet
        result[make_unique(name, result)] = value
    return result
117 # ********* PY TEMPLATES *********
def read_py_function(name):
    '''
    Return the source text of function *name* as found in this script file.

    Recording starts at the first line containing the function header for
    *name* and stops at the next non-empty line that has the same indentation
    as that header. An empty string is returned when nothing matches.
    '''
    collected = []
    record = False
    indent = 0

    # The context manager closes the file even if reading fails part-way.
    with open(__file__) as source:
        for line in source:
            ind = len(line) - len(line.lstrip())

            if ("def " + name) in line:
                record = True
                indent = ind
            elif record and indent == ind and len(line) > 1:
                # len(line) > 1 skips blank lines inside the function body
                record = False

            if record:
                collected.append(line)

    return ''.join(collected)
# Opening part of the generated script. The body of main() is indented with a
# single space so that it lines up with the d[...] assignments this script
# writes right after the header and with the first line of py_footer.
py_header = """#!/usr/bin/env python
# -*- coding: utf-8 -*-

# File generated by json2pcap.py
# json2pcap.py created by Martin Kacer, 2020

import os
import binascii
import array
import sys
import subprocess
from collections import OrderedDict
from scapy import all as scapy

# *****************************************************
# * PACKET PAYLOAD GENERATED FROM INPUT PCAP *
# * Modify this function to edit the packet *
# *****************************************************
def main():
 d = OrderedDict()
"""

# Closing part of the generated script: the end of main(), then the source of
# the helper functions copied verbatim from this file, then the entry point.
py_footer = """ generate_pcap(d)

# *****************************************************
# * FUNCTIONS from TEMPLATE *
# * Do not edit these functions if not required *
# *****************************************************

"""
py_footer += "".join(
    read_py_function(template_function)
    for template_function in (
        "to_bytes",
        "lsb",
        "multiply_strings",
        "rewrite_frame",
        "assemble_frame",
        "generate_pcap",
    )
)
py_footer += """

if __name__ == '__main__':
    main()
"""
184 # ***** End of PY TEMPLATES ******
190 # ********** FUNCTIONS ***********
def raw_flat_collector(dict):
    '''
    Yield (key, value) for every key ending in "_raw" found in *dict* or in
    any mapping nested under a key that does not end in "_raw".

    Objects without an items() method yield nothing.
    '''
    if not hasattr(dict, 'items'):
        return

    for name, content in dict.items():
        if k_is_raw(name) if False else name.endswith("_raw"):
            yield name, content
            continue
        # descend into everything else; non-mappings simply yield nothing
        yield from raw_flat_collector(content)
def _add_raw_field(r, k, v, frame_name, frame_position):
    '''Store one raw field list *v* of key *k* in the result dictionary *r*.'''
    h = v[0]
    p = v[1] - frame_position
    # The declared length v[2] * 2 was always replaced by len(h) whenever the
    # two disagreed, so the stored length is simply the hex string length.
    l = len(h)
    b = v[3]
    t = v[4]

    # Add into result dictionary
    key = make_unique(str(k).replace('.', '_'), r)

    fn = frame_name.replace('.', '_')
    if fn == key:
        fn = None

    r[key] = [fn, h, p, l, b, t]


# d - input dictionary, parsed from json
# r - result dictionary
# frame_name - parent protocol name
# frame_position - parent protocol position
def py_generator(d, r, frame_name='frame_raw', frame_position=0):
    '''
    Walk the decoded layers in *d* and fill *r* with one entry per raw field.

    Every entry is [parent, hex, position, length, bitmask, type]; position is
    made relative to *frame_position* and parent is None when the entry would
    name itself as parent. Returns nothing; *r* is modified in place.
    '''
    # None and any other object without items() carries no fields
    if not hasattr(d, 'items'):
        return

    for k, v in d.items():

        # no recursion
        if k.endswith("_raw") or "_raw_" in k:
            if isinstance(v[1], (list, tuple)) or isinstance(v[2], (list, tuple)):
                # the value holds several raw field lists
                for _v in v:
                    _add_raw_field(r, k, _v, frame_name, frame_position)
            else:
                _add_raw_field(r, k, v, frame_name, frame_position)

        # recursion
        elif isinstance(v, dict):
            fn = frame_name
            fp = frame_position

            # if there is also preceding raw protocol frame use it
            # remove tree suffix
            key = k
            if key.endswith("_tree") or "_tree_" in key:
                key = key.replace('_tree', '')

            raw_key = key + "_raw"
            if raw_key in d:
                fn = raw_key
                fp = d[raw_key][1]

            py_generator(v, r, fn, fp)

        elif isinstance(v, (list, tuple)):
            # List members are generated relative to the current parent frame.
            # NOTE(review): the previous code also looked up a matching
            # "<key>_raw" entry here but never used the result - confirm that
            # keeping the enclosing frame_name/frame_position is intended.
            for _v in v:
                py_generator(_v, r, frame_name, frame_position)
# Emulates int.to_bytes (available since Python 3.2)
def to_bytes(n, length, endianess='big'):
    '''
    Return *n* as a bytearray that is zero-padded to at least *length* bytes.

    The result is longer than *length* when *n* does not fit. Any value of
    *endianess* other than 'big' gives the bytes in reversed order.
    '''
    digits = '%x' % n
    if len(digits) % 2:
        # fromhex() needs an even number of hex digits
        digits = '0' + digits
    raw = bytearray.fromhex(digits.zfill(length * 2))
    if endianess == 'big':
        return raw
    return raw[::-1]
# Returns the index, counting from 0, of the least significant set bit in x
def lsb(x):
    '''Return the bit index of the lowest set bit of *x* (-1 when x is 0).'''
    lowest_bit = x & -x  # keeps only the lowest set bit
    return lowest_bit.bit_length() - 1
# Replace parts of original_string by new_string, only if mask in the byte is not ff
def multiply_strings(original_string, new_string, mask):
    '''
    Return *new_string* with every two-character group whose *mask* group is
    'ff' taken from *original_string* instead.

    Only the leading part covered by all three strings is examined; the rest
    of *new_string* is returned as is. A mask of None returns *new_string*.
    '''
    if mask is None:
        return new_string

    limit = min(len(original_string), len(new_string), len(mask))
    pieces = []
    for offset in range(0, limit, 2):
        group = slice(offset, offset + 2)
        source = original_string if mask[group] == 'ff' else new_string
        pieces.append(source[group])

    # the last group may reach one character past an odd limit
    covered = limit + (limit % 2)
    pieces.append(new_string[covered:])
    return ''.join(pieces)
# Rewrite frame
# h - hex bytes
# p - position
# l - length
# b - bitmask
# t - type
# frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte)
def rewrite_frame(frame_raw, h, p, l, b, t, frame_amask=None):
    '''
    Return a copy of the hex string frame_raw with the field value h written
    into it.

    p and l are used as offset and length into frame_raw itself, i.e. they
    count hex characters (the callers in this file pass byte values multiplied
    by two). t is accepted but not used here. When frame_amask is given, every
    two-character group that it marks with 'ff' keeps the value it already has
    in frame_raw (see multiply_strings).

    frame_raw is returned unchanged when p or l is negative or h is None.
    '''
    if p < 0 or l < 0 or h is None:
        return frame_raw

    # no bitmask
    if(b == 0):
        # the replaced span always has the length of the new value
        if (len(h) != l):
            l = len(h)
        frame_raw_new = frame_raw[:p] + h + frame_raw[p + l:]
        return multiply_strings(frame_raw, frame_raw_new, frame_amask)
    # bitmask
    else:
        # get hex string from frame which will be replaced
        _h = frame_raw[p:p + l]

        # add 0 padding to have correct length
        if (len(_h) % 2 == 1):
            _h = '0' + _h
        if (len(h) % 2 == 1):
            h = '0' + h

        # Only replace bits defined by mask
        # new_hex = (old_hex & !mask) | (new_hex & mask)
        _H = bytearray.fromhex(_h)
        _H = array.array('B', _H)

        # the mask as one byte per byte of the span that is replaced
        M = to_bytes(b, len(_H))
        M = array.array('B', M)
        # shift mask aligned to position
        # (entry i takes the value found p/2 entries further on, entries
        # without such a source become 0)
        for i in range(len(M)):
            if (i + p / 2) < len(M):
                M[i] = M[i + int(p / 2)]
            else:
                M[i] = 0x00

        H = bytearray.fromhex(h)
        H = array.array('B', H)

        # j walks the bytes of the new value; it only advances when a byte of
        # the span has mask bits set
        j = 0
        for i in range(len(_H)):
            if (M[i] != 0):
                # move the new bits up to the lowest set bit of the mask
                v = H[j] << lsb(M[i])
                # keep the unmasked bits of the frame, take the masked bits of v
                _H[i] = (_H[i] & ~M[i]) | (v & M[i])
                j = j + 1

        masked_h = binascii.hexlify(_H)
        masked_h = masked_h.decode('ascii')

        frame_raw_new = frame_raw[:p] + str(masked_h) + frame_raw[p + l:]
        return multiply_strings(frame_raw, frame_raw_new, frame_amask)
def assemble_frame(d, frame_time):
    '''
    Fold every field of *d* into its parent until only parent-less entries
    remain and return the resulting hex string of 'frame_raw'.

    Each value of *d* is [parent, hex, position, length, bitmask, type] with
    position and length given in bytes. Entries are removed from *d* once they
    have been written into their parent. *frame_time* is not used.
    '''
    _ = d['frame_raw'][1]  # the frame entry has to exist before folding starts

    saw_sll = False
    pending = True
    while pending:
        pending = False
        # iterate over a snapshot because folded entries are deleted from d
        for key, val in d.copy().items():
            hex_str, pos, length, bitmask, ftype = (
                str(val[1]), val[2] * 2, val[3] * 2, val[4], val[5])

            saw_sll = saw_sll or key == "sll_raw"

            # an entry that still has children is folded in a later pass
            if any(node[0] == key for node in d.values()):
                pending = True
                continue

            parent = val[0]
            if parent is not None:
                d[parent][1] = rewrite_frame(d[parent][1], hex_str, pos, length, bitmask, ftype)
                del d[key]

    output = d['frame_raw'][1]

    # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame
    if saw_sll:
        output = "000000000000" + output[12:]  # replace dest MAC
        output = output[:24] + output[28:]  # remove two bytes before Protocol

    return output
def generate_pcap(d):
    '''
    Assemble the frame described by *d* and write it to a pcap file.

    The file is named after the running script (sys.argv[0]) with ".pcap"
    appended. The original and the assembled hex strings are printed, and when
    they differ but have equal length the differing character positions are
    printed too.
    '''
    # 1. Assemble frame
    original_hex = d['frame_raw'][1]
    assembled_hex = assemble_frame(d, None)
    print(original_hex)
    print(assembled_hex)
    # 2. Testing: compare input and output for not modified json
    if original_hex != assembled_hex:
        print("Modified frames: ")
        print(original_hex)
        print(assembled_hex)
        if len(original_hex) == len(assembled_hex):
            print([i for i in range(len(original_hex)) if original_hex[i] != assembled_hex[i]])
    # 3. Generate pcap
    outfile = sys.argv[0] + ".pcap"
    pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
    try:
        pcap_out.write(scapy.Packet(bytearray.fromhex(assembled_hex)))
    finally:
        # sync=False buffers the output, so close explicitly to flush it
        pcap_out.close()
    print("Generated " + outfile)
459 # ************ MAIN **************
VERSION = "1.1"

# Command line interface. The description is printed verbatim
# (RawTextHelpFormatter). Backslashes in the example commands are written as
# "\\" because a bare backslash followed by a space is an invalid escape
# sequence in a normal string literal; the resulting help text is unchanged.
parser = argparse.ArgumentParser(description="""
json2pcap {version}

Utility to generate pcap from json format.

Packet modification:
In input json it is possible to modify the raw values of decoded fields.
The output pcap will include the modified values. The algorithm of
generating the output pcap is to get all raw hex fields from input json and
then assembling them by layering from longest (less decoded fields) to
shortest (more decoded fields). It means if the modified raw field is
shorter field (more decoded field) it takes precedence against modification
in longer field (less decoded field). If the json includes duplicated raw
fields with same position and length, the behavior is not deterministic.
For manual packet editing it is always possible to remove any not required
raw fields from json, only frame_raw is field mandatory for reconstruction.

Packet modification with -p switch:
The python script is generated instead of pcap. This python script when
executed will generate the pcap of 1st packet from input json. The
generated code includes the decoded fields and the function to assembly the
packet. This enables to modify the script and programmatically edit or
encode the packet variables. The assembling algorithm is different, because
the decoded packet fields are relative and points to parent node with their
position (compared to input json which has absolute positions).

Pcap masking and anonymization with -m and -a switch:
The script allows to mask or anonymize the selected json raw fields. If the
The fields are selected and located on lower protocol layers, they are not
The overwritten by upper fields which are not marked by these switches.
The pcap masking and anonymization can be performed in the following way:

tshark -r orig.pcap -T json -x | \\ python json2pcap.py -m "ip.src_raw"
-a "ip.dst_raw" -o anonymized.pcap
In this example the ip.src_raw field is masked with ffffffff by byte values
and ip.dst_raw is hashed by randomly generated salt.

Additionally the following syntax is valid to anonymize portion of field
tshark -r orig.pcap -T json -x | \\ python json2pcap.py -m "ip.src_raw[2:]"
-a "ip.dst_raw[:-2]" -o anonymized.pcap
Where the src_ip first byte is preserved and dst_ip last byte is preserved.
And the same can be achieved by
tshark -r orig.pcap -T json -x | \\ python json2pcap.py -m "ip.src_raw[2:8]"
-a "ip.dst_raw[0:6]" -o anonymized.pcap

Masking and anonymization limitations are mainly the following:
- In case the tshark is performing reassembling from multiple frames, the
backward pcap reconstruction is not properly performed and can result in
malformed frames.
- The new values in the fields could violate the field format, as the
json2pcap is no performing correct protocol encoding with respect to
allowed values of the target field and field encoding.

""".format(version=VERSION), formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
parser.add_argument('-i', '--infile', nargs='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no inpout file is specified script reads from stdin.')
parser.add_argument('-o', '--outfile', required=True, help='output pcap filename')
parser.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default=False, action='store_true')
parser.add_argument('-m', '--mask', help='mask the specific raw field (e.g. -m "ip.src_raw" -m "ip.dst_raw[2:6]")', action='append', metavar='MASKED_FIELD')
parser.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action='append', metavar='ANONYMIZED_FIELD')
parser.add_argument('-s', '--salt', help='salt use for anonymization. If no value is provided it is randomized.', default=None)
parser.add_argument('-v', '--verbose', help='verbose output', default=False, action='store_true')
args = parser.parse_args()

# read JSON
infile = args.infile
outfile = args.outfile

# Read from input file
# (the handle stays open; it is consumed lazily by ijson further down)
if infile:
    data_file = open(infile)
# Read from pipe
else:
    data_file = sys.stdin

# Parse anonymization fields
# Maps the bare field name (slice suffix removed) to its AnonymizedField.
# A field given with both -m and -a ends up with the -a setting because the
# -a entries are stored last.
anonymize = {}
if args.mask:
    for m in args.mask:
        if '_raw' not in m:
            # sys.exit(message) prints to stderr and ends with exit status 1,
            # so callers and shell pipelines can detect the failure
            sys.exit("Error: The specified fields by -m switch should be raw fields. " + m + " does not have _raw suffix")
        af = AnonymizedField(m, 0)
        anonymize[af.field] = af
if args.anonymize:
    for a in args.anonymize:
        if '_raw' not in a:
            sys.exit("Error: The specified fields by -a switch should be raw fields. " + a + " does not have _raw suffix")
        af = AnonymizedField(a, 1)
        anonymize[af.field] = af

input_frame_raw = ''
frame_raw = ''
frame_time = None

salt = args.salt
if salt is None:
    # generate random salt if no salt was provided
    salt = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10))
# Generate pcap
if args.python is False:
    pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)

    # Iterate over packets in JSON
    for packet in ijson.items(data_file, "item", buf_size=200000):
        _list = []
        linux_cooked_header = False

        # get flat raw fields into _list
        for raw in raw_flat_collector(packet['_source']['layers']):
            if len(raw) >= 2:
                if (raw[0] == "frame_raw"):
                    frame_raw = raw[1][0]
                    frame_amask = "0" * len(frame_raw)  # initialize anonymization mask
                    input_frame_raw = copy.copy(frame_raw)
                    frame_time = None
                    if 'frame.time_epoch' in packet['_source']['layers']['frame']:
                        frame_time = packet['_source']['layers']['frame']['frame.time_epoch']
                else:
                    # add into value list into raw[5] the field name
                    if isinstance(raw[1], list):
                        raw[1].append(raw[0])
                        _list.append(raw[1])
                if (raw[0] == "sll_raw"):
                    linux_cooked_header = True

        # sort _list
        # The sort is stable: the result is ordered by length (longest first)
        # and, within equal length, by position, so shorter fields are written
        # after the longer ones.
        sorted_list = sorted(_list, key=operator.itemgetter(1), reverse=False)
        sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)
        # print("Debug: " + str(sorted_list))

        # rewrite frame
        for raw in sorted_list:
            if len(raw) >= 6:
                h = str(raw[0])  # hex
                p = raw[1] * 2  # position
                l = raw[2] * 2  # length
                b = raw[3]  # bitmask
                t = raw[4]  # type
                # raw[5]  # field_name (added by script)
                h_mask = h  # hex for anonymization mask

                # anonymize fields
                if (raw[5] in anonymize):
                    [h, h_mask] = anonymize[raw[5]].anonymize_field(h, t, salt)

                if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))):
                    # NOTE(review): this walks every element of raw, which also
                    # includes the field name appended at raw[5] - confirm
                    # against real tshark output before relying on this path.
                    for r in raw:
                        _h = str(r[0])  # hex
                        _p = r[1] * 2  # position
                        _l = r[2] * 2  # length
                        _b = r[3]  # bitmask
                        _t = r[4]  # type
                        # raw[5]  # field_name (added by script)
                        _h_mask = _h  # hex for anonymization mask

                        # anonymize fields
                        if (raw[5] in anonymize):
                            [_h, _h_mask] = anonymize[raw[5]].anonymize_field(_h, _t, salt)

                        # print("Debug: " + str(raw))
                        frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t, frame_amask)

                        # update anonymization mask
                        if (raw[5] in anonymize):
                            frame_amask = rewrite_frame(frame_amask, _h_mask, _p, _l, _b, _t)

                else:
                    # print("Debug: " + str(raw))
                    frame_raw = rewrite_frame(frame_raw, h, p, l, b, t, frame_amask)

                    # update anonymization mask
                    if (raw[5] in anonymize):
                        frame_amask = rewrite_frame(frame_amask, h_mask, p, l, b, t)

        # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap
        if (linux_cooked_header):
            frame_raw = "000000000000" + frame_raw[6 * 2:]  # replace dest MAC
            frame_raw = frame_raw[:12 * 2] + "" + frame_raw[14 * 2:]  # remove two bytes before Protocol

        # With -v: compare input and output, they are equal for not modified json
        if (args.verbose and input_frame_raw != frame_raw):
            print("Modified frames: ")
            s1 = input_frame_raw
            s2 = frame_raw
            print(s1)
            print(s2)
            if (len(s1) == len(s2)):
                d = [i for i in range(len(s1)) if s1[i] != s2[i]]
                print(d)

        new_packet = scapy.Packet(bytearray.fromhex(frame_raw))
        if frame_time:
            new_packet.time = float(frame_time)
        pcap_out.write(new_packet)

    # The writer is opened with sync=False, so close it explicitly to flush
    # everything that is still buffered.
    pcap_out.close()

# Generate python payload only for first packet
else:
    py_outfile = outfile + '.py'

    # The context manager closes the generated script; the previous bare
    # "f.close" only referenced the method and never called it.
    with open(py_outfile, 'w') as f:
        for packet in ijson.items(data_file, "item", buf_size=200000):
            f.write(py_header)

            r = OrderedDict({})

            py_generator(packet['_source']['layers'], r)

            # one assignment per collected field, inside main() of the
            # generated script
            for key, value in r.items():
                f.write(" d['" + key + "'] = " + str(value) + "\n")

            f.write(py_footer)

            print("Generated " + py_outfile)

            # Currently only first packet is used from pcap
            break