# RecordingRAG/03_labels_to_qrels.py

import csv, json, argparse
from collections import defaultdict

def main(argv=None) -> None:
    """Convert a labeled relevance CSV into a JSONL qrels file.

    Reads the CSV given by ``--labeled_csv`` (columns used: ``qid``,
    ``file``, ``chunk_index``, ``rel``, and ``query`` when
    ``--include_query_text`` is set), groups the labeled rows by ``qid``
    and writes one JSON object per query to ``--out_qrels``::

        {"qid": ..., "relevant": [{"file": ..., "chunk_index": ..., "rel": ...}, ...]}

    Rows whose ``rel`` cell is empty or missing are treated as unlabeled
    and skipped. Rows with ``rel == 0`` are skipped unless ``--keep_zero``
    is given. A query with no retained rows is not written at all.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Raises:
        KeyError: If a required column is absent from the CSV header.
        ValueError: If a non-empty ``rel`` cell is not an integer (the
            message includes the file and line), or if ``chunk_index``
            is not an integer.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--labeled_csv", default='./label_pool_recording.csv', help="label_pool_recording.csv with rel filled (0/1/2)")
    ap.add_argument("--out_qrels", default='./qrels_recording.jsonl', help="qrels_recording.jsonl")
    ap.add_argument("--include_query_text", action="store_true", help="Copy 'query' field into qrels")
    ap.add_argument("--keep_zero", action="store_true", help="Include rel=0 entries (usually False)")
    args = ap.parse_args(argv)

    bucket = defaultdict(list)   # qid -> list of retained relevance items
    qtext = {}                   # qid -> query text (last value seen wins)

    # "utf-8-sig" transparently drops a leading BOM (common when the CSV was
    # saved by a spreadsheet tool), which would otherwise corrupt the first
    # header name. newline="" lets the csv module handle line endings itself,
    # as its documentation requires.
    with open(args.labeled_csv, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            qid = row["qid"]
            if args.include_query_text:
                qtext[qid] = row["query"]

            # DictReader fills fields missing from a short row with None, so
            # a plain .get(..., "") default is not enough to guard .strip().
            rel_text = (row.get("rel") or "").strip()
            if not rel_text:
                continue  # unlabeled row

            try:
                rel = int(rel_text)
            except ValueError as err:
                raise ValueError(
                    f"{args.labeled_csv}, line {reader.line_num}: "
                    f"rel must be an integer, got {rel_text!r}"
                ) from err

            if rel == 0 and not args.keep_zero:
                continue

            bucket[qid].append(
                {"file": row["file"], "chunk_index": int(row["chunk_index"]), "rel": rel}
            )

    with open(args.out_qrels, "w", encoding="utf-8") as out:
        for qid, items in bucket.items():
            rec = {"qid": qid, "relevant": items}
            if args.include_query_text:
                rec["query"] = qtext.get(qid)
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"[OK] Wrote: {args.out_qrels}")

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()