From real-time inference to large-scale training, Velar handles any GPU workload.
Deploy large language models like Llama, Mistral, and GPT-J for real-time text generation. Velar handles GPU allocation, scaling, and load balancing so you can focus on building your application.
import velar

app = velar.App("llm-serving")

image = velar.Image.from_registry(
    "pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("vllm")

@app.function(gpu="A100", image=image)
def chat(prompt: str, max_tokens: int = 512):
    from vllm import LLM, SamplingParams

    # Load the model and sample a completion; generate() returns one
    # RequestOutput per prompt, each with a list of candidate outputs.
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
    params = SamplingParams(max_tokens=max_tokens)
    return llm.generate([prompt], params)[0].outputs[0].text
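The function is called like any other Python function. Using the same local_entrypoint and .remote() pattern shown in the batch example below, a caller might look like this (the prompt is illustrative):

@app.local_entrypoint()
def main():
    # Runs chat() on a remote A100 and prints the completion
    print(chat.remote("Explain KV caching in two sentences.", max_tokens=128))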
Run Stable Diffusion, SDXL, and video generation models at scale. Warm GPU pools keep latency low, and automatic batching sustains high-throughput workloads.

import velar

app = velar.App("image-gen")

image = velar.Image.from_registry(
    "pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("diffusers", "transformers", "accelerate")

@app.function(gpu="A10", image=image)
def generate_image(prompt: str, steps: int = 30):
    from diffusers import StableDiffusionXLPipeline
    import torch

    # Load SDXL in half precision and move it to the GPU
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
    ).to("cuda")
    return pipe(prompt, num_inference_steps=steps).images[0]
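As with the chat example, the function can be invoked from a local entrypoint. A minimal sketch, assuming Velar serializes the returned PIL image back to the caller (the prompt and filename are placeholders):

@app.local_entrypoint()
def main():
    img = generate_image.remote("a watercolor lighthouse at dusk")
    # PIL images expose save(); write the result to a local file
    img.save("lighthouse.png")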
Fine-tune foundation models on your own data with managed training jobs. Velar provisions multi-GPU clusters, handles checkpointing, and streams training metrics in real time.

import velar

app = velar.App("fine-tune")

image = velar.Image.from_registry(
    "pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("transformers", "peft", "datasets", "trl")

@app.function(gpu="A100", timeout=3600, image=image)
def train(dataset_path: str, base_model: str):
    from transformers import AutoModelForCausalLM, TrainingArguments
    from trl import SFTTrainer
    from datasets import load_dataset

    model = AutoModelForCausalLM.from_pretrained(base_model)
    # Expects JSON records with a "text" field, SFTTrainer's default
    dataset = load_dataset("json", data_files=dataset_path)
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset["train"],
        args=TrainingArguments(output_dir="./output", num_train_epochs=3),
    )
    trainer.train()
    trainer.save_model()  # write the final weights to ./output
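Kicking off a job follows the same entrypoint pattern; the dataset path and base model below are placeholders:

@app.local_entrypoint()
def main():
    # Runs the training job on a remote A100 with a one-hour timeout
    train.remote("data/train.jsonl", "meta-llama/Llama-2-7b-hf")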
Process large datasets with GPU-accelerated batch jobs. Velar automatically parallelizes work across multiple GPUs and handles retries, making it ideal for embeddings, transcription, and data pipelines.

import velar

app = velar.App("batch-embed")

image = velar.Image.from_registry(
    "pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("sentence-transformers")

@app.function(gpu="L4", image=image)
def embed_batch(texts: list[str]):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("BAAI/bge-large-en-v1.5")
    # encode() returns a numpy array; convert to lists for serialization
    return model.encode(texts, batch_size=64).tolist()

# Process 100k documents in parallel
@app.local_entrypoint()
def main():
    documents = load_documents()  # your data
    chunks = [documents[i:i + 100] for i in range(0, len(documents), 100)]
    results = [embed_batch.remote(chunk) for chunk in chunks]
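Each call returns one embedding per input text, so the per-chunk results can be flattened back into a single list aligned with the original documents. Continuing main(), a sketch under the chunking shown above:

    # One vector per document, in the same order as `documents`
    embeddings = [vec for chunk_result in results for vec in chunk_result]
    assert len(embeddings) == len(documents)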