import requests
import os
import base64
import sys
# Chat-completions endpoint that the request is POSTed to.
invoke_url = "https://integrate.api.nvidia.com/v1/chat/completions"
# When True, the reply is requested as an event stream and printed line by line.
stream = False
# Text prompt sent alongside any media files.
query = "Describe the scene"
# Bearer token for the Authorization header. Taken from the NVIDIA_API_KEY
# environment variable when it is set; otherwise the literal placeholder is
# kept, so pasting a key over the placeholder continues to work.
kApiKey = os.environ.get("NVIDIA_API_KEY", "$NVIDIA_API_KEY")
# Supported media, keyed by lowercase file extension.
# Each value is [MIME type, content-part type used in the request payload].
kSupportedList = {
    ext: [mime, part_type]
    for ext, mime, part_type in (
        ("png", "image/png", "image_url"),
        ("jpg", "image/jpeg", "image_url"),
        ("jpeg", "image/jpeg", "image_url"),
        ("webp", "image/webp", "image_url"),
        ("mp4", "video/mp4", "video_url"),
        ("webm", "video/webm", "video_url"),
        ("mov", "video/mov", "video_url"),
    )
}
def get_extension(filename: str) -> str:
    """Return the extension of *filename*, lowercased and without the dot.

    The extension is whatever ``os.path.splitext`` reports, so only the last
    suffix counts (``"a.tar.GZ"`` gives ``"gz"``). A name without an
    extension yields the empty string.
    """
    _, ext = os.path.splitext(filename)
    # ext is either "" or starts with the separating dot; drop that dot.
    return ext[1:].lower()
def mime_type(ext):
    """Return the MIME type registered in ``kSupportedList`` for *ext*.

    Raises:
        KeyError: if *ext* is not a key of ``kSupportedList``.
    """
    return kSupportedList[ext][0]
def media_type(ext):
    """Return the content-part type registered in ``kSupportedList`` for *ext*.

    The value is the second element of the table entry (``"image_url"`` or
    ``"video_url"`` for the entries defined in this file).

    Raises:
        KeyError: if *ext* is not a key of ``kSupportedList``.
    """
    return kSupportedList[ext][1]
def encode_media_base64(media_file):
    """Encode media file to base64 string.

    The whole file at path *media_file* is read in binary mode and returned
    as base64 text.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(media_file, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
def _build_user_content(media_files, query):
    """Build the user-message content and report whether it holds a video.

    Returns a ``(content, has_video)`` tuple. With no media files the content
    is the bare query string; otherwise it is a list holding one text part
    followed by one base64 data-URL part per file, in the order given.
    """
    if not media_files:
        # Text-only mode
        return query, False

    # Validate every file before reading any of them, so an unsupported
    # extension or an invalid video mix fails before files are encoded.
    extensions = [get_extension(media_file) for media_file in media_files]
    for media_file, ext in zip(media_files, extensions):
        assert ext in kSupportedList, f"{media_file} format is not supported"
    has_video = any(media_type(ext) == "video_url" for ext in extensions)
    if has_video:
        assert len(media_files) == 1, "Only single video supported."

    # Build content array with text and media
    content = [{"type": "text", "text": query}]
    for media_file, ext in zip(media_files, extensions):
        media_type_key = media_type(ext)
        print(f"Encoding {media_file} as base64...")
        base64_data = encode_media_base64(media_file)
        content.append(
            {
                "type": media_type_key,
                media_type_key: {
                    "url": f"data:{mime_type(ext)};base64,{base64_data}"
                },
            }
        )
    return content, has_video


def chat_with_media(infer_url, media_files, query: str, stream: bool = False):
    """Send *query* plus optional media files to a chat-completions endpoint.

    The reply is printed to stdout: the parsed JSON body when *stream* is
    False, or each non-empty response line as it arrives when it is True.
    Nothing is returned.

    Args:
        infer_url: URL the request is POSTed to.
        media_files: list of paths to image/video files; an empty list sends
            a text-only request. The file type is taken from the extension
            and must be a key of ``kSupportedList``.
        query: text prompt placed in the user message.
        stream: request an event stream instead of a single JSON reply.

    Raises:
        AssertionError: if *media_files* is not a list, a file has an
            unsupported extension, or a video is combined with any other
            file. These checks are ``assert`` statements and are therefore
            skipped when Python runs with ``-O``.
    """
    assert isinstance(media_files, list), f"{media_files}"
    content, has_video = _build_user_content(media_files, query)

    headers = {
        "Authorization": f"Bearer {kApiKey}",
        "Content-Type": "application/json",
        "Accept": "text/event-stream" if stream else "application/json",
    }

    # Add system message with appropriate prompt
    # Videos only support /no_think, images support both
    system_prompt = "/no_think" if has_video else "/think"
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": content,
        },
    ]
    payload = {
        "max_tokens": 4096,
        "temperature": 1,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "messages": messages,
        "stream": stream,
        "model": "nvidia/nemotron-nano-12b-v2-vl",
    }

    # The context manager closes the response, which matters in streaming
    # mode where the connection otherwise stays open until fully consumed.
    with requests.post(
        infer_url, headers=headers, json=payload, stream=stream
    ) as response:
        if stream:
            for line in response.iter_lines():
                if line:
                    print(line.decode("utf-8"))
        else:
            print(response.json())
if __name__ == "__main__":
    # Usage:
    #   python test.py                          # Text-only
    #   python test.py sample.mp4               # Single video
    #   python test.py sample1.png sample2.png  # Multiple images
    # Every command-line argument is treated as a media file path.
    media_samples = sys.argv[1:]
    chat_with_media(invoke_url, media_samples, query, stream)

# Follow the steps below to download and run the NVIDIA NIM inference microservice for this model on your infrastructure of choice.
$ docker login nvcr.io
Username: $oauthtoken
Password: <PASTE_API_KEY_HERE>
Pull and run the NVIDIA NIM with the command below. This will download the optimized model for your infrastructure.
# Replace the placeholder with your API key. The variable is exported so that
# `-e NGC_API_KEY` below can forward it into the container.
export NGC_API_KEY=<PASTE_API_KEY_HERE>
# Host directory that is bind-mounted at /opt/nim/.cache inside the container.
export LOCAL_NIM_CACHE=~/.cache/nim
mkdir -p "$LOCAL_NIM_CACHE"
# Start the NIM container interactively and remove it on exit:
#   --gpus all        expose all host GPUs to the container
#   --shm-size=16GB   size of the container's shared-memory mount
#   -u $(id -u)       run as the invoking user's UID
#   -p 8000:8000      publish container port 8000 on host port 8000
docker run -it --rm \
--gpus all \
--shm-size=16GB \
-e NGC_API_KEY \
-v "$LOCAL_NIM_CACHE:/opt/nim/.cache" \
-u $(id -u) \
-p 8000:8000 \
nvcr.io/nim/nvidia/nemotron-nano-12b-v2-vl:latest
You can now make a local API call using this curl command:
# POST a chat-completions request to port 8000 on the local machine.
# The single user message combines a text question with an image given by URL,
# and the reply is capped at 1024 tokens.
curl -X 'POST' \
'http://0.0.0.0:8000/v1/chat/completions' \
-H 'Accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "nvidia/nemotron-nano-12b-v2-vl",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is in this image?"
},
{
"type": "image_url",
"image_url":
{
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1280px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
}
]
}
],
"max_tokens": 1024
}'
For more details on getting started with this NIM, visit the NVIDIA NIM Docs.