# RecordingRAG/03_labels_to_qrels.py
#
# 40 lines
# 1.6 KiB
# Python

import csv, json, argparse
from collections import defaultdict
def _parse_rel(raw):
    """Convert a raw ``rel`` cell into an int, or ``None`` when it is blank.

    Args:
        raw: The cell value from ``csv.DictReader``; ``None`` when the row is
            shorter than the header, otherwise a string.

    Returns:
        The integer label, or ``None`` if the cell is missing or whitespace.

    Raises:
        ValueError: If the cell is non-blank and is not an integer value.
    """
    text = (raw or "").strip()
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        # Accept integral float spellings such as "1.0" (a common result of
        # round-tripping a partly filled integer column through a spreadsheet
        # or dataframe), but still reject real fractions and junk.
        as_float = float(text)
        if not as_float.is_integer():
            raise ValueError(f"rel must be an integer label, got {raw!r}") from None
        return int(as_float)


def main(argv=None):
    """Convert a labeled CSV pool into a JSONL qrels file.

    Reads ``--labeled_csv`` (columns used: ``qid``, ``file``, ``chunk_index``,
    ``rel``, and ``query`` when ``--include_query_text`` is set), groups the
    labeled rows by ``qid`` and writes one JSON object per ``qid`` to
    ``--out_qrels`` in first-seen order. Each object has the shape
    ``{"qid": ..., "relevant": [{"file", "chunk_index", "rel"}, ...]}`` plus
    an optional ``"query"`` field.

    Rows whose ``rel`` cell is blank or missing are treated as unlabeled and
    skipped. Rows with ``rel == 0`` are skipped unless ``--keep_zero`` is
    given. A ``qid`` with no surviving rows is not written at all.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Raises:
        KeyError: If a required column is absent from the CSV.
        ValueError: If a non-blank ``rel`` or a ``chunk_index`` is not an
            integer.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--labeled_csv", default='./label_pool_recording.csv', help="label_pool_recording.csv with rel filled (0/1/2)")
    ap.add_argument("--out_qrels", default='./qrels_recording.jsonl', help="qrels_recording.jsonl")
    ap.add_argument("--include_query_text", action="store_true", help="Copy 'query' field into qrels")
    ap.add_argument("--keep_zero", action="store_true", help="Include rel=0 entries (usually False)")
    args = ap.parse_args(argv)

    bucket = defaultdict(list)  # qid -> list of {"file", "chunk_index", "rel"}
    qtext = {}  # qid -> query text (last row seen for that qid wins)

    # "utf-8-sig" transparently drops a leading BOM (otherwise the first
    # header would become "\ufeffqid"); newline="" lets the csv module handle
    # line endings inside quoted fields itself.
    with open(args.labeled_csv, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            qid = row["qid"]
            if args.include_query_text:
                qtext[qid] = row["query"]
            try:
                rel = _parse_rel(row.get("rel"))
            except ValueError as err:
                # Point the labeler at the offending line of the input file.
                raise ValueError(
                    f"{args.labeled_csv}, line {reader.line_num}: {err}"
                ) from err
            if rel is None:
                continue  # not labeled yet
            if rel == 0 and not args.keep_zero:
                continue
            bucket[qid].append(
                {"file": row["file"], "chunk_index": int(row["chunk_index"]), "rel": rel}
            )

    with open(args.out_qrels, "w", encoding="utf-8") as out:
        for qid, items in bucket.items():
            rec = {"qid": qid, "relevant": items}
            if args.include_query_text:
                rec["query"] = qtext.get(qid)
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"[OK] Wrote: {args.out_qrels}")
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()