2 # Postprocess click data files.
4 # Copyright (C) 2017 Vivek Pal
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License as
8 # published by the Free Software Foundation; either version 2 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 """Postprocesses search and clicks data files.
23 Generates the final clickstream log file and query file for Xapian Letor
24 module from that log file.
27 from __future__
import print_function
37 def generate_combined_log(search_log
, clicks_log
, final_log
):
38 """Generates the final log file.
41 search_log (str): Path to the search log file.
42 clicks_log (str): Path to the clicks log file.
43 final_log (str): Path to the final log file.
45 Example (comma-delimited) entries in search_log:
47 821f03288846297c2cf43c34766a38f7,"book","45,54",0
48 d41d8cd98f00b204e9800998ecf8427e,"","".0
49 d41d8cd98f00b204e9800998ecf8427e,"",""0
50 098f6bcd4621d373cade4e832627b4f6,"test","35,47",0
52 Example (comma-delimited) entries in clicks_log:
54 821f03288846297c2cf43c34766a38f7,54
55 821f03288846297c2cf43c34766a38f7,54
56 098f6bcd4621d373cade4e832627b4f6,35
58 Example (comma-delimited) entries in final_log:
60 QueryID,Query,Hits,Offset,Clicks
61 821f03288846297c2cf43c34766a38f7,book,"45,54",0,"45:0,54:2"
62 098f6bcd4621d373cade4e832627b4f6,test,"35,47",0,"35:1,47:0"
64 QUERYID
, QUERY
, HITS
= 0, 1, 2
67 qid_to_clicks
= collections
.defaultdict(dict)
71 with
open(clicks_log
, 'r') as clicks_f
:
72 clicks_reader
= csv
.reader(clicks_f
)
74 # Build map: qid_to_clicks = {qid: {did: click_count}}
75 for row
in clicks_reader
:
76 qid
, did
= row
[QUERYID
], row
[DOCID
]
78 did_to_count
= qid_to_clicks
[qid
]
80 # Check if did is already present in did_to_count
81 if did
in did_to_count
:
82 # Update did_to_count[did]
83 did_to_count
[did
] += 1
87 qid_to_clicks
[qid
] = did_to_count
89 with
open(search_log
, 'r') as search_f
, open(final_log
, 'w+') as final_f
:
90 search_reader
= csv
.reader(search_f
)
91 writer
= csv
.writer(final_f
)
93 # Add headers to final log file
94 writer
.writerow(["QueryID", "Query", "Hits", "Offset", "Clicks"])
98 for row
in search_reader
:
99 # Skip rows with empty Query string or empty Hitlist
100 if row
[QUERY
] == '' or row
[HITS
] == "":
103 # Avoid duplicate entries
104 if row
[QUERY
] in queries
:
107 queries
.add(row
[QUERY
])
109 # Convert Hitlist from str to list
114 hits
= hits
.strip().split(',')
119 # Update clicklist with click values stored in map.
120 if row
[QUERYID
] in qid_to_clicks
:
121 did_to_count
= qid_to_clicks
[row
[QUERYID
]]
122 for index
, did
in enumerate(clicklist
):
123 if did
in did_to_count
:
124 clicklist
[index
] = '%s:%i' % (did
, did_to_count
[did
])
126 clicklist
[index
] = did
+ ':0'
128 for index
, did
in enumerate(clicklist
):
129 clicklist
[index
] = did
+ ':0'
131 # Serialise "Hits" and "Clicks"
132 row
[HITS
] = ','.join(row
[HITS
])
133 clicklist
= ','.join(clicklist
)
135 row
.append(clicklist
)
139 def generate_query_file(final_log
, query_file
):
140 """Generates query file formatted as per Xapian Letor documentation.
143 final_log (string): Path to final log file.
144 query_file (string): Path to query file.
146 Example (comma-delimited) entries in final_log:
148 QueryID,Query,Hits,Offset,Clicks
149 821f03288846297c2cf43c34766a38f7,book,"45,54",0,"45:0,54:2"
150 098f6bcd4621d373cade4e832627b4f6,test,"35,47",0,"35:1,47:0"
152 Example (comma-delimited) entries in query_file:
154 821f03288846297c2cf43c34766a38f7,book
155 098f6bcd4621d373cade4e832627b4f6,test
157 with
open(final_log
, 'r') as s
, open(query_file
, 'w+') as w
:
158 reader
= csv
.DictReader(s
)
159 writer
= csv
.writer(w
)
162 writer
.writerow([row
['QueryID'], row
['Query']])
165 def test_functions():
167 test_search
= tempfile
.NamedTemporaryFile(delete
=True)
169 test_search
.write('821f03288846297c2cf43c34766a38f7,"book","45,54",0\n')
170 test_search
.write('821f03288846297c2cf43c34766a38f7,"book","45,54",0\n')
171 test_search
.write('d41d8cd98f00b204e9800998ecf8427e,"","",0\n')
172 test_search
.write('098f6bcd4621d373cade4e832627b4f6,"test","35,47",0\n')
175 test_clicks
= tempfile
.NamedTemporaryFile(delete
=True)
177 test_clicks
.write('821f03288846297c2cf43c34766a38f7,54\n')
178 test_clicks
.write('821f03288846297c2cf43c34766a38f7,54\n')
179 test_clicks
.write('098f6bcd4621d373cade4e832627b4f6,35\n')
182 test_final
= tempfile
.NamedTemporaryFile(delete
=True)
183 generate_combined_log(test_search
.name
, test_clicks
.name
, test_final
.name
)
186 test_query
= tempfile
.NamedTemporaryFile(delete
=True)
187 generate_query_file(test_final
.name
, test_query
.name
)
190 # Test entries in final log are correct.
191 with
open(test_final
.name
, 'r') as final_f
:
192 reader
= csv
.reader(final_f
)
197 for i
, row
in enumerate(reader
):
199 assert row
== ['821f03288846297c2cf43c34766a38f7',
200 'book', '45,54', '0', '45:0,54:2'], "Incorrect entry in final.log"
202 assert row
== ['098f6bcd4621d373cade4e832627b4f6',
203 'test', '35,47', '0', '35:1,47:0'], "Incorrect entry in final.log"
205 # Test entries in query file are correct.
206 with
open(test_query
.name
, 'r') as query_f
:
207 reader
= csv
.reader(query_f
)
209 for i
, row
in enumerate(reader
):
211 assert row
== ['821f03288846297c2cf43c34766a38f7','book'], "Incorrect entry in query.txt"
213 assert row
== ['098f6bcd4621d373cade4e832627b4f6','test'], "Incorrect entry in query.txt"
215 # Close all files to safely delete them.
219 print("Error: %s - %s." % (e
.filename
, e
.strerror
))
223 print("Error: %s - %s." % (e
.filename
, e
.strerror
))
227 print("Error: %s - %s." % (e
.filename
, e
.strerror
))
231 print("Error: %s - %s." % (e
.filename
, e
.strerror
))
234 if __name__
== '__main__':
235 parser
= argparse
.ArgumentParser(
236 description
='''Postprocess click data files.
238 This script generates the final clickstream log file from input search and
239 click log files and creates query file that can be used by the Xapian Letor
240 module for generating its training files.''',
241 formatter_class
=argparse
.RawTextHelpFormatter
)
242 parser
.add_argument("search_log", type=str, help="Path to the search.log file.")
243 parser
.add_argument("clicks_log", type=str, help="Path to the clicks.log file.")
244 parser
.add_argument("final_log", type=str, help="Path to save final.log file.")
245 parser
.add_argument("query_file", type=str, help="Path to save query.txt file.")
246 parser
.add_argument("--test", help="Run tests for this script.", action
='store_true')
247 args
= parser
.parse_args()
254 generate_combined_log(args
.search_log
, args
.clicks_log
, args
.final_log
)
255 generate_query_file(args
.final_log
, args
.query_file
)
257 print(e
, file=sys
.stderr
)