Path parameters
-
The type of the inference task that the model will perform. NOTE: The
chat_completion task type only supports streaming and only through the _stream API. Values are
chat_completion, completion, rerank, or text_embedding. -
The unique identifier of the inference endpoint.
Query parameters
-
Specifies the amount of time to wait for the inference endpoint to be created.
External documentation
Body
Required
-
The chunking configuration object. Applies only to the
text_embedding task type. Not applicable to the rerank, completion, or chat_completion task types. External documentation -
The type of service supported for the specified task type. In this case,
nvidia. Value is
nvidia. -
Settings used to install the inference model. These settings are specific to the
nvidia service. -
Settings to configure the inference task. Applies only to the
text_embedding task type. Not applicable to the rerank, completion, or chat_completion task types. These settings are specific to the task type you specified.
PUT _inference/text_embedding/nvidia-text-embedding
{
"service": "nvidia",
"service_settings": {
"url": "nvidia-embeddings-url",
"api_key": "nvidia-embeddings-token",
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2"
}
}
resp = client.inference.put(
task_type="text_embedding",
inference_id="nvidia-text-embedding",
inference_config={
"service": "nvidia",
"service_settings": {
"url": "nvidia-embeddings-url",
"api_key": "nvidia-embeddings-token",
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2"
}
},
)
const response = await client.inference.put({
task_type: "text_embedding",
inference_id: "nvidia-text-embedding",
inference_config: {
service: "nvidia",
service_settings: {
url: "nvidia-embeddings-url",
api_key: "nvidia-embeddings-token",
model_id: "nvidia/llama-3.2-nv-embedqa-1b-v2",
},
},
});
response = client.inference.put(
task_type: "text_embedding",
inference_id: "nvidia-text-embedding",
body: {
"service": "nvidia",
"service_settings": {
"url": "nvidia-embeddings-url",
"api_key": "nvidia-embeddings-token",
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2"
}
}
)
$resp = $client->inference()->put([
"task_type" => "text_embedding",
"inference_id" => "nvidia-text-embedding",
"body" => [
"service" => "nvidia",
"service_settings" => [
"url" => "nvidia-embeddings-url",
"api_key" => "nvidia-embeddings-token",
"model_id" => "nvidia/llama-3.2-nv-embedqa-1b-v2",
],
],
]);
curl -X PUT -H "Authorization: ApiKey $ELASTIC_API_KEY" -H "Content-Type: application/json" -d '{"service":"nvidia","service_settings":{"url":"nvidia-embeddings-url","api_key":"nvidia-embeddings-token","model_id":"nvidia/llama-3.2-nv-embedqa-1b-v2"}}' "$ELASTICSEARCH_URL/_inference/text_embedding/nvidia-text-embedding"
{
"service": "nvidia",
"service_settings": {
"url": "nvidia-embeddings-url",
"api_key": "nvidia-embeddings-token",
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2"
}
}
{
"service": "nvidia",
"service_settings": {
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2",
"api_key": "nvidia-text-embeddings-token"
},
"task_settings": {
"input_type": "ingest",
"truncate": "start"
}
}
{
"service": "nvidia",
"service_settings": {
"url": "nvidia-completion-url",
"api_key": "nvidia-completion-token",
"model_id": "microsoft/phi-3-mini-128k-instruct"
}
}
{
"service": "nvidia",
"service_settings": {
"api_key": "nvidia-completion-token",
"model_id": "microsoft/phi-3-mini-128k-instruct"
}
}
{
"service": "nvidia",
"service_settings": {
"url": "nvidia-chat-completion-url",
"api_key": "nvidia-chat-completion-token",
"model_id": "microsoft/phi-3-mini-128k-instruct"
}
}
{
"service": "nvidia",
"service_settings": {
"api_key": "nvidia-chat-completion-token",
"model_id": "microsoft/phi-3-mini-128k-instruct"
}
}
{
"service": "nvidia",
"service_settings": {
"url": "nvidia-rerank-url",
"api_key": "nvidia-rerank-token",
"model_id": "nv-rerank-qa-mistral-4b:1"
}
}
{
"service": "nvidia",
"service_settings": {
"api_key": "nvidia-rerank-token",
"model_id": "nv-rerank-qa-mistral-4b:1"
}
}
{
"inference_id": "nvidia-text-embedding",
"task_type": "text_embedding",
"service": "nvidia",
"service_settings": {
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2",
"url": "nvidia-embeddings-url",
"rate_limit": {
"requests_per_minute": 3000
},
"dimensions": 2048,
"similarity": "dot_product"
},
"chunking_settings": {
"strategy": "sentence",
"max_chunk_size": 250,
"sentence_overlap": 1
}
}
{
"inference_id": "nvidia-text-embedding",
"task_type": "text_embedding",
"service": "nvidia",
"service_settings": {
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2",
"rate_limit": {
"requests_per_minute": 3000
},
"dimensions": 2048,
"similarity": "dot_product"
},
"task_settings": {
"input_type": "ingest",
"truncate": "start"
},
"chunking_settings": {
"strategy": "sentence",
"max_chunk_size": 250,
"sentence_overlap": 1
}
}
{
"inference_id": "nvidia-completion",
"task_type": "completion",
"service": "nvidia",
"service_settings": {
"model_id": "microsoft/phi-3-mini-128k-instruct",
"url": "nvidia-completion-url",
"rate_limit": {
"requests_per_minute": 3000
}
}
}
{
"inference_id": "nvidia-completion",
"task_type": "completion",
"service": "nvidia",
"service_settings": {
"model_id": "microsoft/phi-3-mini-128k-instruct",
"rate_limit": {
"requests_per_minute": 3000
}
}
}
{
"inference_id": "nvidia-chat-completion",
"task_type": "chat_completion",
"service": "nvidia",
"service_settings": {
"model_id": "microsoft/phi-3-mini-128k-instruct",
"url": "nvidia-chat-completion-url",
"rate_limit": {
"requests_per_minute": 3000
}
}
}
{
"inference_id": "nvidia-chat-completion",
"task_type": "chat_completion",
"service": "nvidia",
"service_settings": {
"model_id": "microsoft/phi-3-mini-128k-instruct",
"rate_limit": {
"requests_per_minute": 3000
}
}
}
{
"inference_id": "nvidia-rerank",
"task_type": "rerank",
"service": "nvidia",
"service_settings": {
"model_id": "nv-rerank-qa-mistral-4b:1",
"url": "nvidia-rerank-url",
"rate_limit": {
"requests_per_minute": 3000
}
}
}
{
"inference_id": "nvidia-rerank",
"task_type": "rerank",
"service": "nvidia",
"service_settings": {
"model_id": "nv-rerank-qa-mistral-4b:1",
"rate_limit": {
"requests_per_minute": 3000
}
}
}