from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face Hub identifier of the checkpoint; shared by the tokenizer and
# the model so the two can never drift apart.
MODEL_ID = "deepseek-ai/deepseek-coder-1.3b-instruct"

# Instantiate the tokenizer, i.e. how to turn a sentence into numbers so that
# we can pass it as input to the model.
# NOTE(review): trust_remote_code=True lets code shipped in the model
# repository run locally -- confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

# Instantiate the model. The first time you run this, it will download the model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Format a prompt to pass to the LLM.
messages = [
    {
        'role': 'user',
        'content': "Write quicksort in Python.",
    }
]

# Tokenize the prompt. return_dict=True yields both `input_ids` and
# `attention_mask`, so generate() receives an explicit mask instead of having
# to infer one, and the result has the same shape regardless of whether the
# installed library would otherwise return a bare tensor or a mapping.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Number of prompt tokens; used below to strip the prompt from the output.
prompt_length = inputs["input_ids"].shape[-1]

# Generate a response from the LLM!
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,  # Generate at most 512 new tokens (tokens, not words)
        do_sample=True,
        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,  # explicitly set EOS token
    )

# Decode only the newly generated tokens.
# Note that we slice the output starting at prompt_length; otherwise we would
# see the prompt echoed at the start of our response.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)
