πŸ—οΈ OptiML APIs#

1.  🧠 Platform-Independent Inference

Build your server using the OptiML model and interact with it effortlessly from any programming language or framework.

2.  🐍 Python

# Start the server with OptiML-converted model
./server -m models/llama-7b.optiml.gguf -c 2048

# Python client example
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model file to load and the device it should be placed on.
model_path = "llama-7b.optiml.gguf"
target_device = "cuda"

# NOTE(review): nothing below sends a request to the server started above; the
# model appears to be loaded in-process through transformers — confirm that
# this is the intended "client" flow.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map=target_device,
    trust_remote_code=True,
)

# Prompt and sampling settings, forwarded to chat() as keyword arguments.
prompt = "Write an article about Artificial Intelligence."
sampling = {"temperature": 0.9, "top_p": 0.9}

# chat() is unpacked into a response and a history value; only the response
# is printed here.
response, history = model.chat(tokenizer, prompt, **sampling)
print(response)

3.  💻 C++

#include <iostream>
#include <string>

#include <optiml.h>

// Loads the OptiML model and tokenizer from `path`, sends one prompt through
// model.chat(), and prints the returned text to stdout.
int main() {
    std::string path = "llama-7b.optiml.gguf";
    std::string device = "gpu-cpu";

    optiml::Tokenizer tokenizer(path);

    optiml::Model model = optiml::load(path, optiml::DType::BF16, device);

    // C++ has no named arguments: writing `temperature = 0.9` inside a call is
    // an assignment to an undeclared identifier and does not compile. Keep the
    // readable names as locals and pass them positionally instead.
    // NOTE(review): assumes chat() takes temperature first, then top_p —
    // confirm against the declaration in optiml.h.
    const double temperature = 0.9;
    const double top_p = 0.9;

    std::string prompt = "Write an article about Artificial Intelligence.";
    const std::string response = model.chat(tokenizer, prompt, temperature, top_p);
    std::cout << response << std::endl;

    return 0;
}

4.  🌐 Node.js

# Start the server
./server -m models/llama-7b.optiml.gguf -c 2048

# index.js
const prompt = "Write an article about Artificial Intelligence.";

/**
 * Posts `prompt` to the local completion endpoint and logs the `content`
 * field of the JSON reply.
 *
 * @returns {Promise<void>} Resolves once the reply has been logged.
 * @throws {Error} If the server answers with a non-2xx status.
 */
async function chat() {
  const res = await fetch("http://127.0.0.1:8080/completion", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ prompt, n_predict: 512 }),
  });

  // fetch() resolves even for HTTP error statuses, so check explicitly rather
  // than treating an error body as a completion.
  if (!res.ok) {
    throw new Error(`Completion request failed: ${res.status} ${res.statusText}`);
  }

  const { content } = await res.json();
  console.log(content);
}

// Attach a rejection handler so a failed request is reported and the process
// exits non-zero, instead of surfacing as an unhandled promise rejection.
chat().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

# chat with model
node index.js

5.  ⚛️ React

// remember to run the server first
// ./server -m models/llama-7b.optiml.gguf -c 2048

// ChatComponent.jsx
import { useState } from 'react';

export default function ChatComponent() {
  const [response, setResponse] = useState('');

  const handleChat = async () => {
    const res = await fetch('http://localhost:8080/completion', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt: "Write an article about Artificial Intelligence.", n_predict: 128 }),
    });
    const data = await res.json();
    setResponse(data.content);
  };

  return (
    <div>
      <button onClick={handleChat}>Send Prompt</button>
      <pre>{response}</pre>
    </div>
  );
}

6.  🚀 Next.js

// start server first
// ./server -m models/llama-7b.optiml.gguf -c 2048

// pages/api/chat.ts
/**
 * API route that relays the JSON request body to the local completion server
 * and returns the server's JSON reply.
 *
 * Responds with 405 for non-POST requests, with the upstream status code when
 * the server answers with JSON, and with 502 when the server cannot be
 * reached or its reply cannot be parsed as JSON.
 *
 * @param req Incoming request; `req.method` and `req.body` are read.
 * @param res Outgoing response; `setHeader`, `status` and `json` are called.
 */
export default async function handler(req, res) {
  if (req.method !== 'POST') {
    res.setHeader('Allow', 'POST');
    res.status(405).json({ error: 'Method Not Allowed' });
    return;
  }

  try {
    const upstream = await fetch('http://localhost:8080/completion', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(req.body),
    });
    const payload = await upstream.json();

    // Mirror the upstream status instead of always answering 200, so callers
    // can tell a failed completion from a successful one.
    res.status(upstream.status).json(payload);
  } catch (err) {
    // Network failure or a non-JSON reply from the completion server.
    res.status(502).json({ error: `Completion server unavailable: ${err.message}` });
  }
}

// components/ChatClient.tsx
'use client';
import { useState } from 'react';

export default function ChatClient() {
  const [res, setRes] = useState('');
  const ask = async () => {
    const r = await fetch('/api/chat', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt: 'Write about AI.', n_predict: 128 }),
    });
    setRes((await r.json()).content);
  };
  return <div><button onClick={ask}>Ask</button><pre>{res}</pre></div>;
}