RecordingRAG/01_generate_questions.py

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4, 5, 6, 7"

import json
import torch
import pandas as pd
from tqdm import tqdm
import argparse, warnings
from random import sample
from transformers import AutoTokenizer, AutoModelForCausalLM

warnings.filterwarnings("ignore")

def parse_args():
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="openai/gpt-oss-20b", help="HF model id")
    ap.add_argument("--batch_size", type=int, default=2)
    ap.add_argument("--max_new_tokens", type=int, default=1024)
    ap.add_argument("--reasoning_effort", default="medium", choices=["low", "medium", "high"])
    ap.add_argument("--chunks", default="./lecture_chunks.json")
    ap.add_argument("--out", default='out')
    return ap.parse_args()

# SYSTEM_TEMPLATE = (
#     """You are an expert university teaching assistant. Your task is to generate clear, relevant, and technically accurate questions from provided academic content. Please follow these instructions:

#     1. Carefully read the CONTEXT_TEXT.
#     2. Assess whether the context contains sufficient technical content, terminology, or concepts suitable for generating a meaningful question-answer (QA) pair.
#     3. If the context is appropriate, create a concise, well-formed question that tests understanding of a key technical concept or detail from the context. The question should be clear and unambiguous.
#     4. If the context is not suitable (e.g., lacks technical content or is purely administrative or casual talk), do not generate a question.
#     5. Output ONLY the question in the final channel. Do not include explanations, answers, or any other text. The output must be a single question, and nothing else.
#     """
#     )

SYSTEM_TEMPLATE = (
    """You are an expert Univeristy TA. Here are your instructions:
        1. Read the CONTEXT_TEXT.
        2. Decide whether this context is useful for the task of generating QA pairs.
        3. If the given context has good amount of technical words and concepts, generate a simple question.
        4. Output ONLY the question compulsorily in the final channel. Do not OUTPUT anything else apart from the question.
        """
)

def build_messages(text, effort="medium", shots=None):

    system_msg = SYSTEM_TEMPLATE.format(effort=effort)
    user_msg = (
        f"CONTEXT_TEXT: {text}\n"
        f"QUESTION: "
    )

    messages = [{"role": "system", "content": system_msg}]

    if shots:
        messages.extend(shots)

    messages.append({"role": "user", "content": user_msg})
    return messages

def extract_final_text(tokenizer, generated_ids):

    text = tokenizer.decode(generated_ids, skip_special_tokens=False)
    if "<|channel|>final<|message|>" in text:
        final = text.split("<|channel|>final<|message|>")[-1]
        final = final.split("<|end|>")[0]
        return final.replace("<|return|>", "").strip()

    return text.strip()

def infer_one(text, model, tokenizer, reasoning_effort="medium", max_new_tokens=1024):
    messages = build_messages(text, effort=reasoning_effort)

    enc = tokenizer.apply_chat_template(
        messages,
        reasoning_effort=reasoning_effort,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = enc["input_ids"].to(model.device)
    attn_mask = enc["attention_mask"].to(model.device)

    with torch.no_grad():
        gen = model.generate(
            input_ids=input_ids,
            attention_mask=attn_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    return extract_final_text(tokenizer, gen[0])


def main():

    args = parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    tokenizer.padding_side = "left"

    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    )

    with open(args.chunks) as f:
        chunks = json.load(f)

    # number of questions
    sampled = sample(list(enumerate(chunks)), 15)
    message_lists = [(chunk_id, build_messages(c.get("text") or c.get("chunk"))) for chunk_id, c in sampled]

    prompts = []

    for chunk_id, message_list in message_lists:
        prompt_str = tokenizer.apply_chat_template(
            message_list,
            reasoning_effort=args.reasoning_effort,
            add_generation_prompt=True,
            tokenize=False
        )
        prompts.append((chunk_id, prompt_str))

    text = """Welcome, everyone. My name is Alejandro Kuratomi, and I will be leading this lecture on classification, as well as a follow-up lecture on explainability later in the course. Today\u2019s focus is classification, a fundamental task in machine learning. Given its importance, you\u2019ve likely encountered it in previous courses within the program, and most of the topics we\u2019ll cover should be at least somewhat familiar. \n\nTo gauge your background, how many of you have a science, technology, or engineering background? [Pause for response]. This lecture will delve deeper into these familiar concepts, becoming more mathematically rigorous. I encourage you to interrupt me at any time if you have questions or need clarification. \n\nSpecifically, we will explore Naive Bayes and the Naive Bayes classifier \u2013 a quick, easy-to-implement model based on the assumption of feature conditional independence, which we will discuss in detail. We will also examine Support Vector Machines (SVMs). You\u2019ve likely been introduced to SVMs in prior coursework, perhaps as early as the fourth semester. We\u2019ll move beyond a superficial understanding to explore the mathematical foundations of SVMs, which are particularly strong in this area. The derivation of SVMs is complex but logical and intuitive, and I hope you\u2019ll find it insightful.\n\nHere\u2019s the agenda for today: Naive Bayes Classifier (NBC), and then Kernel methods."""

    output = infer_one(text, model, tokenizer)
    print(output)

    results = []
    for i in tqdm(range(0, len(prompts), args.batch_size)):

        batch = prompts[i:i + args.batch_size]
        batch_ids, batch_prompts = zip(*batch)

        encodings = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **encodings,
                max_new_tokens=args.max_new_tokens,
                do_sample=False,
                eos_token_id=tokenizer.convert_tokens_to_ids("END_OF_QUESTIONS")
            )

        for chunk_id, output_ids in zip(batch_ids, outputs):

            final = extract_final_text(tokenizer, output_ids)
            results.append({
                "chunk_id": chunk_id,
                "questions": final
            })

    outpath = os.path.join(args.out, "questions.jsonl")
    with open(outpath, "w") as f:
        for r in results:
            f.write(json.dumps(r) + "\n")

    df = pd.DataFrame(results)
    csv_path = os.path.join(args.out, "questions.csv")
    df.to_csv(csv_path, index=False)

    print(df.head())

if __name__ == "__main__":
    main()