OptiML APIs
1. Platform Independent Inference
Build your server using the OptiML model and interact with it effortlessly from any programming language or framework.
2. Python
# Start the server with OptiML-converted model
./server -m models/llama-7b.optiml.gguf -c 2048
# Python client example: load the converted model through Hugging Face
# Transformers and run a single chat turn.
# NOTE(review): nothing in this snippet references the HTTP server started
# above; the model appears to be loaded in-process. Confirm which workflow
# this section is meant to demonstrate.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "llama-7b.optiml.gguf"
target_device = "cuda"

# trust_remote_code=True lets Transformers execute Python code shipped with
# the checkpoint, so only use it with model files you trust.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map=target_device,
    trust_remote_code=True,
)

# chat() returns the generated text plus a second value that this example
# does not use (presumably the conversation history; verify against the model).
reply, _history = model.chat(
    tokenizer,
    "Write an article about Artificial Intelligence.",
    temperature=0.9,
    top_p=0.9,
)
print(reply)
3. C++
#include <iostream>
#include <string>

#include <optiml.h>

// Loads the converted model, runs one chat prompt, and prints the reply.
int main() {
    std::string path = "llama-7b.optiml.gguf";
    std::string device = "gpu-cpu";

    // C++ has no keyword arguments, so the sampling settings are declared as
    // named locals and passed positionally.
    // NOTE(review): assumes optiml::Model::chat takes (tokenizer, prompt,
    // temperature, top_p) in this order; confirm against optiml.h.
    const double temperature = 0.9;
    const double top_p = 0.9;

    optiml::Tokenizer tokenizer(path);
    optiml::Model model = optiml::load(path, optiml::DType::BF16, device);

    std::string prompt = "Write an article about Artificial Intelligence.";
    std::string response = model.chat(tokenizer, prompt, temperature, top_p);
    std::cout << response << std::endl;
    return 0;
}
4. Node.js
# Start the server
./server -m models/llama-7b.optiml.gguf -c 2048
# index.js
const prompt = "Write an article about Artificial Intelligence.";

/**
 * Sends `prompt` to the local completion endpoint and prints the generated text.
 *
 * @returns {Promise<void>} Resolves once the `content` field of the JSON reply has been logged.
 * @throws {Error} If the server answers with a non-2xx HTTP status.
 */
async function chat() {
  const res = await fetch("http://127.0.0.1:8080/completion", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ prompt, n_predict: 512 }),
  });

  // fetch() only rejects on network failure; HTTP error statuses must be
  // checked explicitly, otherwise an error body is logged as `undefined`.
  if (!res.ok) {
    throw new Error(`Completion request failed: ${res.status} ${res.statusText}`);
  }

  const data = await res.json();
  console.log(data.content);
}
// Run the example; report any failure (server not running, bad response)
// explicitly and exit non-zero instead of leaving the promise unhandled.
chat().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
# chat with model
node index.js
5. React
// Remember to start the server first:
// ./server -m models/llama-7b.optiml.gguf -c 2048
// ChatComponent.jsx
import { useState } from 'react';
export default function ChatComponent() {
const [response, setResponse] = useState('');
const handleChat = async () => {
const res = await fetch('http://localhost:8080/completion', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ prompt: "Write an article about Artificial Intelligence.", n_predict: 128 }),
});
const data = await res.json();
setResponse(data.content);
};
return (
<div>
<button onClick={handleChat}>Send Prompt</button>
<pre>{response}</pre>
</div>
);
}
6. Next.js
// start server first
// ./server -m models/llama-7b.optiml.gguf -c 2048
// pages/api/chat.ts
/**
 * API route that proxies the request body to the local completion endpoint.
 *
 * Responds with the upstream JSON body and the upstream HTTP status, or with
 * 502 and an `{ error }` object when the upstream cannot be reached or does
 * not return valid JSON.
 *
 * @param req Incoming request; `req.body` is forwarded as the JSON payload.
 * @param res Response object exposing `status(code)` and `json(body)`.
 */
export default async function handler(req, res) {
  try {
    const upstream = await fetch('http://localhost:8080/completion', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(req.body),
    });
    const payload = await upstream.json();
    // Forward the upstream status so callers can tell failures from
    // successes instead of always receiving 200.
    res.status(upstream.status).json(payload);
  } catch (err) {
    res.status(502).json({ error: `Completion server request failed: ${err.message}` });
  }
}
// components/ChatClient.tsx
'use client';
import { useState } from 'react';
export default function ChatClient() {
const [res, setRes] = useState('');
const ask = async () => {
const r = await fetch('/api/chat', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ prompt: 'Write about AI.', n_predict: 128 }),
});
setRes((await r.json()).content);
};
return <div><button onClick={ask}>Ask</button><pre>{res}</pre></div>;
}