4 # Copyright 2020, Martin Kacer <kacer.martin[AT]gmail.com> and contributors
6 # Wireshark - Network traffic analyzer
7 # By Gerald Combs <gerald@wireshark.org>
8 # Copyright 1998 Gerald Combs
10 # SPDX-License-Identifier: GPL-2.0-or-later
24 from collections
import OrderedDict
25 from scapy
import all
as scapy
27 # Field anonymization class
class AnonymizedField:
    """
    The Anonymization field object specifying anonymization

    :field arg: field name, optionally followed by a slice suffix such as
                ``ip.src_raw[2:]``, ``ip.dst_raw[:-2]`` or ``ip.src_raw[2:8]``
    :type arg: anonymization type [0 masking 0xff, 1 anonymization shake_256]
    :start arg: If specified, the anonymization starts at given offset
    :end arg: If specified, the anonymization ends at given offset

    The offsets index the hex string of the field (they are applied with
    ordinary string slicing), so ``[2:]`` leaves the first byte untouched.
    """

    def __init__(self, field, type):
        # Defaults used when ``field`` carries no ``[start:end]`` suffix.
        self.field = field
        self.type = type
        self.start = None
        self.end = None

        match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field)
        if match is None:
            # Plain field name (e.g. "ip.src_raw"): anonymize the whole field.
            return

        self.field = match.group(1)
        start = match.group(2)
        end = match.group(3)
        if start is not None:
            self.start = int(start)
        if end is not None:
            self.end = int(end)

    # Returns the new field value after anonymization
    def anonymize_field_shake256(self, field, type, salt):
        """
        Hash ``field`` together with ``salt`` using SHAKE-256 and return a
        replacement string of exactly ``len(field)`` characters.

        :field arg: hex string to be replaced
        :type arg: numeric field type; 26, 27 and 28 are treated as string
                   types (presumably Wireshark field-type codes - confirm)
        :salt arg: salt string mixed into the hash
        """
        shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8'))

        # String type, output should be ASCII
        if type in [26, 27, 28]:
            # Every hex digit of the digest is itself encoded as two hex
            # digits, so each resulting byte is the ASCII code of 0-9/a-f.
            length = math.ceil(len(field) / 4)
            shake_hash = shake.hexdigest(length)
            ret_string = array.array('B', str.encode(shake_hash))
            ret_string = ''.join('{:02x}'.format(x) for x in ret_string)
        # Other types, output could be HEX
        else:
            length = math.ceil(len(field) / 2)
            shake_hash = shake.hexdigest(length)
            ret_string = shake_hash

        # Correct the string length
        if (len(ret_string) < len(field)):
            ret_string = ret_string.ljust(len(field))
        if (len(ret_string) > len(field)):
            ret_string = ret_string[:len(field)]

        return ret_string

    def anonymize_field(self, _h, _t, salt):
        """
        Mask or anonymize the selected part of the hex string ``_h``.

        :_h arg: hex string of the field
        :_t arg: field type, forwarded to anonymize_field_shake256
        :salt arg: salt used for the shake_256 anonymization
        :returns: list ``[h, h_mask]``; ``h`` is the new hex string and
                  ``h_mask`` has the same length with 'f' for every replaced
                  hex digit and '0' for every preserved one
        """
        # Normalise the optional bounds against the actual field length.
        # An end of 0 is treated like "not specified", and an inverted range
        # selects nothing instead of producing a longer field.
        s, e, _ = slice(self.start, self.end or None).indices(len(_h))
        e = max(s, e)

        h = _h[s:e]
        if self.type == 0:
            # masking: every selected hex digit becomes 'f'
            h = 'f' * len(h)
        elif self.type == 1:
            h = self.anonymize_field_shake256(h, _t, salt)

        h_mask = '0' * len(_h[0:s]) + 'f' * len(h) + '0' * len(_h[e:])
        h = _h[0:s] + h + _h[e:]
        return [h, h_mask]
def make_unique(key, dct):
    """
    Return a key derived from ``key`` that is not yet present in ``dct``.

    ``key`` itself is returned when it is free; otherwise ``key_1``,
    ``key_2``, ... are tried until an unused name is found. ``dct`` is only
    queried with ``in`` and is never modified.
    """
    unique_key = key
    counter = 0

    while unique_key in dct:
        counter += 1
        unique_key = '{}_{}'.format(key, counter)

    return unique_key
def parse_object_pairs(pairs):
    """
    Build an OrderedDict from ``(key, value)`` pairs, keeping duplicates.

    A key that already exists is renamed with make_unique (``key_1``,
    ``key_2``, ...) instead of overwriting the earlier value. Insertion
    order of the pairs is preserved.
    """
    dct = OrderedDict()
    for key, value in pairs:
        if key in dct:
            key = make_unique(key, dct)
        dct[key] = value

    return dct
117 # ********* PY TEMPLATES *********
def read_py_function(name):
    """
    Return the source text of a function from this script file.

    The file named by ``__file__`` is scanned line by line. Recording starts
    at a line containing the function header for ``name`` (substring match)
    and stops at the next non-empty line that has the same indentation as
    that header. An empty string is returned when no such header exists.
    """
    collected = []
    recording = False
    indent = 0

    # The context manager closes the file even if reading fails.
    with open(__file__) as source:
        for line in source:
            ind = len(line) - len(line.lstrip())

            if line.find("def " + name) != -1:
                recording = True
                indent = ind
            elif recording and indent == ind and len(line) > 1:
                # first statement after the function body
                recording = False

            if recording:
                collected.append(line)

    return ''.join(collected)
# py_header and py_footer hold text assigned from triple-quoted string
# literals. py_footer is then extended with the source of the helper
# functions to_bytes, lsb, multiply_strings, rewrite_frame, assemble_frame
# and generate_pcap, each copied out of this file by read_py_function, and
# finally with one more literal.
# NOTE(review): the closing quotes of the literals are not present in this
# copy of the file, so where each template ends cannot be read from here.
141 py_header
= """#!/usr/bin/env python
142 # -*- coding: utf-8 -*-
144 # File generated by json2pcap.py
145 # json2pcap.py created by Martin Kacer, 2020
152 from collections import OrderedDict
153 from scapy import all as scapy
155 # *****************************************************
156 # * PACKET PAYLOAD GENERATED FROM INPUT PCAP *
157 # * Modify this function to edit the packet *
158 # *****************************************************
163 py_footer
= """ generate_pcap(d)
165 # *****************************************************
166 # * FUNCTIONS from TEMPLATE *
167 # * Do not edit these functions if not required *
168 # *****************************************************
171 py_footer
= py_footer
+ read_py_function("to_bytes")
172 py_footer
= py_footer
+ read_py_function("lsb")
173 py_footer
= py_footer
+ read_py_function("multiply_strings")
174 py_footer
= py_footer
+ read_py_function("rewrite_frame")
175 py_footer
= py_footer
+ read_py_function("assemble_frame")
176 py_footer
= py_footer
+ read_py_function("generate_pcap")
178 py_footer
= py_footer
+ """
180 if __name__ == '__main__':
184 # ***** End of PY TEMPLATES ******
190 # ********** FUNCTIONS ***********
def raw_flat_collector(dict):
    """
    Recursively yield every ``*_raw`` entry of a nested dictionary.

    Generator producing ``(key, value)`` tuples for each key ending in
    ``_raw``. Values stored under other keys are searched recursively; a
    value without an ``items`` attribute yields nothing.
    """
    if not hasattr(dict, 'items'):
        return

    for k, v in dict.items():
        if k.endswith("_raw"):
            yield k, v
        else:
            for val in raw_flat_collector(v):
                yield val
203 # d - input dictionary, parsed from json
204 # r - result dictionary
205 # frame_name - parent protocol name
206 # frame_position - parent protocol position
# Walks the items of d. For keys that end with "_raw" or contain "_raw_" an
# entry is prepared for the result dictionary r: the key has '.' replaced by
# '_' and is made unique with make_unique, the value is the list
# [fn, h, p, l, b, t] where fn is frame_name with '.' replaced by '_' and p
# has frame_position subtracted from it. dict values and list/tuple values
# are processed by recursive calls.
# NOTE(review): several statements are not present in this copy of the
# function (h, l, b, t, fp and _v are used but never assigned in the
# visible lines), so the description above is limited to what is shown.
207 def py_generator(d
, r
, frame_name
='frame_raw', frame_position
=0):
# NOTE(review): both operands of this test check d; presumably one of them
# was meant to check r - confirm.
208 if (d
is None or d
is None):
211 if hasattr(d
, 'items'):
212 for k
, v
in d
.items():
215 if k
.endswith("_raw") or "_raw_" in k
:
# v[1] or v[2] being a list/tuple selects the first of the two similar
# sections below.
216 if isinstance(v
[1], (list, tuple)) or isinstance(v
[2], (list, tuple)):
227 p
= p
- frame_position
229 # Add into result dictionary
230 key
= str(k
).replace('.', '_')
231 key
= make_unique(key
, r
)
233 fn
= frame_name
.replace('.', '_')
236 value
= [fn
, h
, p
, l
, b
, t
]
249 p
= p
- frame_position
251 # Add into result dictionary
252 key
= str(k
).replace('.', '_')
253 key
= make_unique(key
, r
)
255 fn
= frame_name
.replace('.', '_')
258 value
= [fn
, h
, p
, l
, b
, t
]
264 if isinstance(v
, dict):
268 # if there is also preceding raw protocol frame use it
271 if (key
.endswith("_tree") or ("_tree_" in key
)):
272 key
= key
.replace('_tree', '')
274 raw_key
= key
+ "_raw"
# recursion into the nested dictionary with fn / fp as the new parent
281 py_generator(v
, r
, fn
, fp
)
283 elif isinstance(v
, (list, tuple)):
288 # if there is also preceding raw protocol frame use it
291 if (key
.endswith("_tree") or ("_tree_" in key
)):
292 key
= key
.replace('_tree', '')
294 raw_key
= key
+ "_raw"
# recursion for list members keeps the current parent name and position
300 py_generator(_v
, r
, frame_name
, frame_position
)
302 # To emulate Python 3.2
def to_bytes(n, length, endianess='big'):
    """
    Convert the non-negative integer ``n`` into a bytearray.

    :n arg: integer value to convert
    :length arg: minimal number of bytes; the result is left-padded with
                 zero bytes up to this size (a larger value is not truncated)
    :endianess arg: 'big' (default) returns most significant byte first,
                    any other value returns the bytes reversed
    """
    h = '%x' % n
    # Pad to an even number of hex digits first, then up to the requested size.
    s = bytearray.fromhex(('0' * (len(h) % 2) + h).zfill(length * 2))
    return s if endianess == 'big' else s[::-1]
# Returns the index, counting from 0, of the least significant set bit in x
def lsb(x):
    """
    Return the zero-based index of the lowest set bit of the integer ``x``.

    ``x & -x`` isolates the lowest set bit; its bit length minus one is the
    index. For ``x == 0`` no bit is set and the result is -1.
    """
    return (x & -x).bit_length() - 1
312 # Replace parts of original_string by new_string, only if mask in the byte is not ff
def multiply_strings(original_string, new_string, mask):
    """
    Restore protected bytes of ``original_string`` inside ``new_string``.

    All three arguments are hex strings and are compared two hex digits
    (one byte) at a time. Wherever ``mask`` holds 'ff', the byte from
    ``original_string`` is kept; everywhere else the byte from
    ``new_string`` is used. Only the common length of the three strings is
    examined; the rest of ``new_string`` is returned unchanged.

    :mask arg: hex string mask, or None to return ``new_string`` as is
    """
    ret_string = new_string
    # rewrite_frame passes its frame_amask default of None straight through
    if mask is None:
        return ret_string

    for i in range(0, min(len(original_string), len(new_string), len(mask)), 2):
        if mask[i:i + 2] == 'ff':
            ret_string = ret_string[:i] + original_string[i:i + 2] + ret_string[i + 2:]

    return ret_string
331 # frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte)
# Writes the hex string h into frame_raw at hex-string offset p over l
# characters and returns the result of
# multiply_strings(frame_raw, frame_raw_new, frame_amask).
# Two replacement forms are visible: a plain splice
# (frame_raw[:p] + h + frame_raw[p + l:]) and a bitwise merge in which only
# the bits selected by the mask derived from b are taken from h
# (new = (old & ~mask) | (new & mask)).
# t is not used in the visible lines.
# NOTE(review): the statements choosing between the two forms, and a few
# others (e.g. the initialisation of j), are not present in this copy.
332 def rewrite_frame(frame_raw
, h
, p
, l
, b
, t
, frame_amask
=None):
333 if p
< 0 or l
< 0 or h
is None:
340 frame_raw_new
= frame_raw
[:p
] + h
+ frame_raw
[p
+ l
:]
341 return multiply_strings(frame_raw
, frame_raw_new
, frame_amask
)
344 # get hex string from frame which will be replaced
345 _h
= frame_raw
[p
:p
+ l
]
347 # add 0 padding to have correct length
348 if (len(_h
) % 2 == 1):
350 if (len(h
) % 2 == 1):
353 # Only replace bits defined by mask
354 # new_hex = (old_hex & !mask) | (new_hex & mask)
355 _H
= bytearray
.fromhex(_h
)
356 _H
= array
.array('B', _H
)
# M is the mask b expanded to as many bytes as the replaced slice _H
358 M
= to_bytes(b
, len(_H
))
359 M
= array
.array('B', M
)
360 # shift mask aligned to position
361 for i
in range(len(M
)):
362 if (i
+ p
/ 2) < len(M
):
363 M
[i
] = M
[i
+ int(p
/ 2)]
367 H
= bytearray
.fromhex(h
)
368 H
= array
.array('B', H
)
370 # for i in range(len(_H)):
371 # print "{0:08b}".format(_H[i]),
373 # for i in range(len(M)):
374 # print "{0:08b}".format(M[i]),
378 for i
in range(len(_H
)):
# the new byte is shifted left so it lines up with the lowest set mask bit
380 v
= H
[j
] << lsb(M
[i
])
381 # print "Debug: {0:08b}".format(v),
382 _H
[i
] = (_H
[i
] & ~M
[i
]) |
(v
& M
[i
])
383 # print "Debug: " + str(_H[i]),
386 # for i in range(len(_H)):
387 # print "{0:08b}".format(_H[i]),
# convert the merged bytes back into a hex text string
390 masked_h
= binascii
.hexlify(_H
)
391 masked_h
= masked_h
.decode('ascii')
393 frame_raw_new
= frame_raw
[:p
] + str(masked_h
) + frame_raw
[p
+ l
:]
394 return multiply_strings(frame_raw
, frame_raw_new
, frame_amask
)
# Rebuilds the frame hex string from the field entries of d.
# For every entry val of _d, h is str(val[1]) and p / l are val[2] and
# val[3] doubled (per the inline comments: hex, position, length). When the
# entry is not a parent and val[0] is not None, element [1] of the entry
# named by val[0] is rewritten in place with rewrite_frame.
# The output is read from d['frame_raw'][1]; if an "sll_raw" key was seen,
# the first 6 bytes are replaced by zeros and bytes 12-13 are removed.
# frame_time is not used in the visible lines.
# NOTE(review): the assignments of _d, b, t and isParent are not present in
# this copy of the function.
397 def assemble_frame(d
, frame_time
):
398 input = d
['frame_raw'][1]
400 linux_cooked_header
= False
404 for key
, val
in _d
.items():
405 h
= str(val
[1]) # hex
406 p
= val
[2] * 2 # position
407 l
= val
[3] * 2 # length
411 if (key
== "sll_raw"):
412 linux_cooked_header
= True
414 # only if the node is not parent
416 for k
, v
in d
.items():
422 if not isParent
and val
[0] is not None:
423 d
[val
[0]][1] = rewrite_frame(d
[val
[0]][1], h
, p
, l
, b
, t
)
426 output
= d
['frame_raw'][1]
428 # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame
429 if (linux_cooked_header
):
430 output
= "000000000000" + output
[6*2:] # replace dest MAC
431 output
= output
[:12*2] + "" + output
[14*2:] # remove two bytes before Protocol
# Reassembles the frame from d with assemble_frame(d, None), prints
# "Modified frames: " when the result differs from d['frame_raw'][1], and
# writes the frame as a single scapy packet to a pcap file named
# sys.argv[0] + ".pcap" (opened with append=False, sync=False), then
# prints "Generated <outfile>".
# NOTE(review): the assignments of s1 and s2 are not present in this copy.
435 def generate_pcap(d
):
437 input = d
['frame_raw'][1]
438 output
= assemble_frame(d
, None)
441 # 2. Testing: compare input and output for not modified json
442 if (input != output
):
443 print("Modified frames: ")
448 if (len(s1
) == len(s2
)):
# NOTE(review): this rebinds the parameter name d to the list of differing
# indexes.
449 d
= [i
for i
in range(len(s1
)) if s1
[i
] != s2
[i
]]
452 outfile
= sys
.argv
[0] + ".pcap"
453 pcap_out
= scapy
.PcapWriter(outfile
, append
=False, sync
=False)
454 new_packet
= scapy
.Packet(bytearray
.fromhex(output
))
455 pcap_out
.write(new_packet
)
456 print("Generated " + outfile
)
459 # ************ MAIN **************
# Command line definition. The description is a template formatted with
# version=VERSION and shown with RawTextHelpFormatter.
# Options: -i/--infile (optional value), -o/--outfile (required),
# -p/--python (flag), -m/--mask and -a/--anonymize (repeatable, collected
# with action='append'), -s/--salt (default None), -v/--verbose (flag).
463 parser
= argparse
.ArgumentParser(description
="""
466 Utility to generate pcap from json format.
469 In input json it is possible to modify the raw values of decoded fields.
470 The output pcap will include the modified values. The algorithm of
471 generating the output pcap is to get all raw hex fields from input json and
472 then assembling them by layering from longest (less decoded fields) to
473 shortest (more decoded fields). It means if the modified raw field is
474 shorter field (more decoded field) it takes precedence against modification
475 in longer field (less decoded field). If the json includes duplicated raw
476 fields with same position and length, the behavior is not deterministic.
477 For manual packet editing it is always possible to remove any not required
478 raw fields from json, only frame_raw is field mandatory for reconstruction.
480 Packet modification with -p switch:
481 The python script is generated instead of pcap. This python script when
482 executed will generate the pcap of 1st packet from input json. The
483 generated code includes the decoded fields and the function to assembly the
484 packet. This enables to modify the script and programmatically edit or
485 encode the packet variables. The assembling algorithm is different, because
486 the decoded packet fields are relative and points to parent node with their
487 position (compared to input json which has absolute positions).
489 Pcap masking and anonymization with -m and -a switch:
490 The script allows to mask or anonymize the selected json raw fields. If the
491 The fields are selected and located on lower protocol layers, they are not
492 The overwritten by upper fields which are not marked by these switches.
493 The pcap masking and anonymization can be performed in the following way:
495 tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw"
496 -a "ip.dst_raw" -o anonymized.pcap
497 In this example the ip.src_raw field is masked with ffffffff by byte values
498 and ip.dst_raw is hashed by randomly generated salt.
500 Additionally the following syntax is valid to anonymize portion of field
501 tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:]"
502 -a "ip.dst_raw[:-2]" -o anonymized.pcap
503 Where the src_ip first byte is preserved and dst_ip last byte is preserved.
504 And the same can be achieved by
505 tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:8]"
506 -a "ip.dst_raw[0:6]" -o anonymized.pcap
508 Masking and anonymization limitations are mainly the following:
509 - In case the tshark is performing reassembling from multiple frames, the
510 backward pcap reconstruction is not properly performed and can result in
512 - The new values in the fields could violate the field format, as the
513 json2pcap is no performing correct protocol encoding with respect to
514 allowed values of the target field and field encoding.
516 """.format(version
=VERSION
), formatter_class
=argparse
.RawTextHelpFormatter
)
517 parser
.add_argument('--version', action
='version', version
='%(prog)s ' + VERSION
)
518 parser
.add_argument('-i', '--infile', nargs
='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no inpout file is specified script reads from stdin.')
519 parser
.add_argument('-o', '--outfile', required
=True, help='output pcap filename')
520 parser
.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default
=False, action
='store_true')
521 parser
.add_argument('-m', '--mask', help='mask the specific raw field (e.g. -m "ip.src_raw" -m "ip.dst_raw[2:6]")', action
='append', metavar
='MASKED_FIELD')
522 parser
.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action
='append', metavar
='ANONYMIZED_FIELD')
523 parser
.add_argument('-s', '--salt', help='salt use for anonymization. If no value is provided it is randomized.', default
=None)
524 parser
.add_argument('-v', '--verbose', help='verbose output', default
=False, action
='store_true')
525 args
= parser
.parse_args()
# Input selection and anonymization setup.
# data_file is either a file opened from infile or sys.stdin.
# anonymize maps a field name (AnonymizedField.field, i.e. without any
# slice suffix) to its AnonymizedField; fields from -m are created with
# type 0 (masking) and fields from -a with type 1 (shake_256).
# NOTE(review): the conditions guarding these statements, and a close of
# data_file, are not present in this copy.
529 outfile
= args
.outfile
531 # Read from input file
533 data_file
= open(infile
)
536 data_file
= sys
.stdin
538 # Parse anonymization fields
543 print("Error: The specified fields by -m switch should be raw fields. " + m
+ " does not have _raw suffix")
545 af
= AnonymizedField(m
, 0)
546 anonymize
[af
.field
] = af
548 for a
in args
.anonymize
:
550 print("Error: The specified fields by -a switch should be raw fields. " + a
+ " does not have _raw suffix")
552 af
= AnonymizedField(a
, 1)
553 anonymize
[af
.field
] = af
561 # generate random salt if no salt was provided
# 10 characters drawn from ASCII letters and digits using SystemRandom
562 salt
= ''.join(random
.SystemRandom().choice(string
.ascii_letters
+ string
.digits
) for _
in range(10))
# Pcap generation (taken when args.python is False).
# Every packet streamed by ijson from data_file is rebuilt: the raw fields
# of packet['_source']['layers'] are collected with raw_flat_collector,
# sorted, and written one after another into frame_raw with rewrite_frame.
# frame_amask starts as a string of "0" as long as frame_raw and is updated
# with h_mask for every field found in anonymize; it is passed to
# rewrite_frame on later writes.
# NOTE(review): several statements (e.g. building _list and the
# assignments of b, t, _h, _b, _t, s1, s2) are not present in this copy.
565 if args
.python
is False:
566 pcap_out
= scapy
.PcapWriter(outfile
, append
=False, sync
=False)
568 # Iterate over packets in JSON
569 for packet
in ijson
.items(data_file
, "item", buf_size
=200000):
571 linux_cooked_header
= False
573 # get flat raw fields into _list
574 for raw
in raw_flat_collector(packet
['_source']['layers']):
576 if (raw
[0] == "frame_raw"):
577 frame_raw
= raw
[1][0]
578 frame_amask
= "0"*len(frame_raw
) # initialize anonymization mask
579 input_frame_raw
= copy
.copy(frame_raw
)
581 if 'frame.time_epoch' in packet
['_source']['layers']['frame']:
582 frame_time
= packet
['_source']['layers']['frame']['frame.time_epoch']
584 # add into value list into raw[5] the field name
585 if isinstance(raw
[1], list):
586 raw
[1].append(raw
[0])
588 if (raw
[0] == "sll_raw"):
589 linux_cooked_header
= True
# Two sorts in sequence: first ascending by element 1, then descending by
# element 2. The loop below reads element 1 as position and element 2 as
# length.
592 sorted_list
= sorted(_list
, key
=operator
.itemgetter(1), reverse
=False)
593 sorted_list
= sorted(sorted_list
, key
=operator
.itemgetter(2), reverse
=True)
594 # print("Debug: " + str(sorted_list))
597 for raw
in sorted_list
:
599 h
= str(raw
[0]) # hex
600 p
= raw
[1] * 2 # position
601 l
= raw
[2] * 2 # length
604 # raw[5] # field_name (added by script)
605 h_mask
= h
# hex for anonymization mask
608 if (raw
[5] in anonymize
):
609 [h
, h_mask
] = anonymize
[raw
[5]].anonymize_field(h
, t
, salt
)
611 if (isinstance(p
, (list, tuple)) or isinstance(l
, (list, tuple))):
614 _p
= r
[1] * 2 # position
615 _l
= r
[2] * 2 # length
618 # raw[5] # field_name (added by script)
619 _h_mask
= _h
# hex for anonymization mask
622 if (raw
[5] in anonymize
):
623 [_h
, _h_mask
] = anonymize
[raw
[5]].anonymize_field(_h
, _t
, salt
)
625 # print("Debug: " + str(raw))
626 frame_raw
= rewrite_frame(frame_raw
, _h
, _p
, _l
, _b
, _t
, frame_amask
)
628 # update anonymization mask
629 if (raw
[5] in anonymize
):
630 frame_amask
= rewrite_frame(frame_amask
, _h_mask
, _p
, _l
, _b
, _t
)
633 # print("Debug: " + str(raw))
634 frame_raw
= rewrite_frame(frame_raw
, h
, p
, l
, b
, t
, frame_amask
)
636 # update anonymization mask
637 if (raw
[5] in anonymize
):
638 frame_amask
= rewrite_frame(frame_amask
, h_mask
, p
, l
, b
, t
)
640 # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap
641 if (linux_cooked_header
):
642 frame_raw
= "000000000000" + frame_raw
[6 * 2:] # replace dest MAC
643 frame_raw
= frame_raw
[:12 * 2] + "" + frame_raw
[14 * 2:] # remove two bytes before Protocol
645 # Testing: remove comment to compare input and output for not modified json
646 if (args
.verbose
and input_frame_raw
!= frame_raw
):
647 print("Modified frames: ")
652 if (len(s1
) == len(s2
)):
653 d
= [i
for i
in range(len(s1
)) if s1
[i
] != s2
[i
]]
# the rebuilt hex string becomes one scapy packet stamped with frame_time
656 new_packet
= scapy
.Packet(bytearray
.fromhex(frame_raw
))
658 new_packet
.time
= float(frame_time
)
659 pcap_out
.write(new_packet
)
# Python payload generation: the output file is named outfile + '.py' and
# opened for writing. For a packet read from data_file, py_generator fills
# r, and every item of r is written as one line of the form
# " d['<key>'] = <value>". Finally "Generated <py_outfile>" is printed.
# NOTE(review): the creation of r, the writes of the templates and the
# close of f are not present in this copy.
661 # Generate python payload only for first packet
663 py_outfile
= outfile
+ '.py'
664 f
= open(py_outfile
, 'w')
667 for packet
in ijson
.items(data_file
, "item", buf_size
=200000):
672 #print "packet = " + str(packet['_source']['layers'])
673 py_generator(packet
['_source']['layers'], r
)
675 for key
, value
in r
.items():
676 f
.write(" d['" + key
+ "'] =",)
677 f
.write(" " + str(value
) + "\n")
681 # Currently only first packet is used from pcap
684 print("Generated " + py_outfile
)