import argparse
import textwrap

import colorama
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import logging as t_logging
from transformers import pipeline

# Configure logging level for the transformers library
t_logging.set_verbosity_info()

# Enable ANSI colour handling (a no-op on most terminals, required on Windows)
colorama.init()


# Utility Functions
def get_prompt(human_prompt):
    prompt_template = f"### Human: {human_prompt} \n### Assistant:"
    return prompt_template


def remove_human_text(text):
    # Cut off anything generated after the model starts a new "### Human:" turn
    return text.split("### Human:", 1)[0]


def parse_text(data):
    # Extract and print only the assistant's reply from each generated sequence
    for item in data:
        text = item["generated_text"]
        assistant_text_index = text.find("### Assistant:")
        if assistant_text_index != -1:
            assistant_text = text[
                assistant_text_index + len("### Assistant:") :
            ].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print(wrapped_text)


# Reasoning question
EXAMPLE_REASONING = "Answer the following question by reasoning step by step. \
The cafeteria had 22 apples. If they used 20 for lunch, and bought 6 more, \
how many apples do they have?"


# User interface
def main(model_dir):
    # Model loading for inference
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    base_model = LlamaForCausalLM.from_pretrained(
        model_dir,
        load_in_8bit=True,
        device_map="auto",
    )

    pipe = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=512,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15,
    )

    print("Ready for inference!")

    # Simple chat loop: wrap the user's input in the Human/Assistant template,
    # generate a completion, and print only the assistant's part of it.
    while True:
        input_prompt = input("USER:")
        print(colorama.Style.DIM + f"You are submitting: {input_prompt}")
        print(colorama.Style.RESET_ALL)
        raw_output = pipe(get_prompt(input_prompt))
        parse_text(raw_output)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Interactive chat with a local LLaMA model.")
    parser.add_argument("model_dir", help="Path to the directory holding the model and tokenizer")
    args = parser.parse_args()

    print("Warming up the engines...")
    main(args.model_dir)
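

# Example invocation (a sketch; "chat.py" and the model path below are
# placeholders, not names taken from this script or its repository):
#
#   python chat.py ./llama-model-dir
#
# model_dir must point at a directory from which from_pretrained() can load
# both the LLaMA weights and the tokenizer files.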