40 lines
1.6 KiB
Python
40 lines
1.6 KiB
Python
import csv, json, argparse
|
|
from collections import defaultdict
|
|
|
|
def main() -> None:
    """Convert a labeled relevance CSV into a JSONL qrels file.

    Rows of the input CSV are grouped by their ``qid`` column. For every
    qid that has at least one kept row, one JSON object is written per line
    to the output file in the form
    ``{"qid": ..., "relevant": [{"file", "chunk_index", "rel"}, ...]}``,
    plus a ``"query"`` key when ``--include_query_text`` is given.

    Command-line options:
        --labeled_csv: path of the labeled CSV to read.
        --out_qrels: path of the JSONL file to write.
        --include_query_text: copy the row's ``query`` column into each record.
        --keep_zero: keep rows whose ``rel`` is 0 (dropped by default).

    Rows whose ``rel`` cell is missing or blank are treated as unlabeled and
    skipped.

    Raises:
        KeyError: if a required column (``qid``, ``file``, ``chunk_index``,
            or ``query`` with ``--include_query_text``) is absent.
        ValueError: if a non-blank ``rel`` or a ``chunk_index`` is not an
            integer literal.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--labeled_csv", default='./label_pool_recording.csv', help="label_pool_recording.csv with rel filled (0/1/2)")
    ap.add_argument("--out_qrels", default='./qrels_recording.jsonl', help="qrels_recording.jsonl")
    ap.add_argument("--include_query_text", action="store_true", help="Copy 'query' field into qrels")
    ap.add_argument("--keep_zero", action="store_true", help="Include rel=0 entries (usually False)")
    args = ap.parse_args()

    bucket = defaultdict(list)  # qid -> list of kept judgment dicts
    qtext = {}  # qid -> query text (last value seen for that qid)

    # utf-8-sig transparently drops a leading BOM, which would otherwise end
    # up glued to the first header name; newline="" lets the csv module do
    # its own newline handling so quoted multi-line fields parse correctly.
    with open(args.labeled_csv, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            qid = row["qid"]
            if args.include_query_text:
                # Recorded before the rel filter, so unlabeled rows still
                # contribute their query text.
                qtext[qid] = row["query"]

            # DictReader fills missing trailing fields with None, so a plain
            # .get(..., "") default is not enough to make .strip() safe.
            rel = (row.get("rel") or "").strip()
            if not rel:
                continue  # unlabeled row

            rel = int(rel)
            if rel == 0 and not args.keep_zero:
                continue

            bucket[qid].append(
                {"file": row["file"], "chunk_index": int(row["chunk_index"]), "rel": rel}
            )

    with open(args.out_qrels, "w", encoding="utf-8") as out:
        # Records are written in first-seen qid order (dict insertion order).
        for qid, items in bucket.items():
            rec = {"qid": qid, "relevant": items}
            if args.include_query_text:
                rec["query"] = qtext.get(qid)
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"[OK] Wrote: {args.out_qrels}")
|
|
|
|
# Run the converter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()