"""Convert a labeled pool CSV into a qrels JSONL file.

The input CSV is read with ``csv.DictReader`` and must provide the columns
``qid``, ``file``, ``chunk_index`` and ``rel`` (plus ``query`` when
``--include_query_text`` is given). Each output line is a JSON object of the
form ``{"qid": ..., "relevant": [{"file", "chunk_index", "rel"}, ...]}``.
"""
import argparse
import csv
import json
from collections import defaultdict


def _read_labels(path, include_query_text, keep_zero):
    """Read the labeled CSV and group the kept judgments by query id.

    Args:
        path: Path of the labeled CSV file.
        include_query_text: When true, remember each row's ``query`` value
            per qid (the last row seen for a qid wins).
        keep_zero: When true, rows with ``rel == 0`` are kept; otherwise they
            are skipped.

    Returns:
        A ``(bucket, qtext)`` pair: ``bucket`` maps qid to the list of kept
        items in file order, ``qtext`` maps qid to its query text (empty
        unless ``include_query_text`` is set).

    Raises:
        KeyError: If a required column is missing from the CSV header.
        ValueError: If ``rel`` or ``chunk_index`` is not an integer literal.
    """
    bucket = defaultdict(list)  # qid -> list of relevant items
    qtext = {}  # qid -> query

    # newline="" lets the csv module do its own line-ending handling, which
    # is required for quoted fields that contain embedded line breaks.
    with open(path, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            qid = row["qid"]
            if include_query_text:
                # Recorded before the rel filter, so a qid's text is captured
                # even from rows that end up being skipped.
                qtext[qid] = row["query"]

            # DictReader fills columns missing from a short row with None,
            # so row.get("rel", "") can still return None; treat that the
            # same as an unlabeled (empty) cell.
            rel_text = (row.get("rel") or "").strip()
            if not rel_text:
                continue

            rel = int(rel_text)
            if rel == 0 and not keep_zero:
                continue

            bucket[qid].append(
                {
                    "file": row["file"],
                    "chunk_index": int(row["chunk_index"]),
                    "rel": rel,
                }
            )
    return bucket, qtext


def _write_qrels(path, bucket, qtext, include_query_text):
    """Write one JSON record per qid in ``bucket`` to ``path`` (JSONL)."""
    with open(path, "w", encoding="utf-8") as out:
        for qid, items in bucket.items():
            rec = {"qid": qid, "relevant": items}
            if include_query_text:
                rec["query"] = qtext.get(qid)
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")


def main(argv=None):
    """Command-line entry point: build the qrels JSONL from the labeled CSV.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Only qids that end up with at least one kept item are written; unlabeled
    rows (empty ``rel``) are always skipped, and ``rel == 0`` rows are
    skipped unless ``--keep_zero`` is given.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--labeled_csv", default='./label_pool_recording.csv',
                    help="label_pool_recording.csv with rel filled (0/1/2)")
    ap.add_argument("--out_qrels", default='./qrels_recording.jsonl',
                    help="qrels_recording.jsonl")
    ap.add_argument("--include_query_text", action="store_true",
                    help="Copy 'query' field into qrels")
    ap.add_argument("--keep_zero", action="store_true",
                    help="Include rel=0 entries (usually False)")
    args = ap.parse_args(argv)

    bucket, qtext = _read_labels(
        args.labeled_csv, args.include_query_text, args.keep_zero
    )
    _write_qrels(args.out_qrels, bucket, qtext, args.include_query_text)

    print(f"[OK] Wrote: {args.out_qrels}")


if __name__ == "__main__":
    main()