Talk to Your Data: A UDF That Speaks Your Language

SELECT u.name, COUNT(o.id) AS total_orders
FROM `commerce`.sales.users AS u
JOIN `commerce`.sales.orders AS o ON u.id = o.user_id
WHERE o.status = "completed"
AND DATE_DIFF_STR(NOW_STR(), o.order_date, "day") <= 30
GROUP BY u.name
ORDER BY total_orders DESC
LIMIT 5;

SELECT u.name, COUNT(o.id) AS total_orders

FROM `commerce`.sales.users AS u

JOIN `commerce`.sales.orders AS o ON u.id = o.user_id

WHERE o.status = "completed"

AND DATE_DIFF_STR(NOW_STR(), o.order_date, "day") <= 30

GROUP BY u.name

ORDER BY total_orders DESC

LIMIT 5;

위의 쿼리는 지난 30일 동안 가장 많은 주문을 완료한 상위 5명의 사용자에 대한 Couchbase에 저장된 데이터에서 귀중한 인사이트를 제공합니다. 하지만 고급 SQL++ 개발자가 아닌데 오후 11시까지 보고서를 위한 답변이 필요하다면 어떻게 해야 할까요? 그런 다음 개발자가 SQL++ 쿼리를 작성하여 답변을 얻을 때까지 기다려야 합니다.

또는 다음과 같은 질문을 해결하기 위해 임시 디버깅을 수행해야 하는 경우를 생각해 보세요:

주문이 배송된 날짜가 누락된 문서가 있나요?
주문이 취소되었다는 뜻인가요? 아니면 주문이 잘못 접수되어 주문이 배송되지 않았나요? 아니면 모든 것이 정상인데 필드에 주문_배송됨 값을 추가하는 것을 놓쳤나요?

이 경우 주문_배송됨 필드를 검색할 뿐만 아니라 주문_취소됨을 확인하거나 댓글을 조사하여 잘못 배송되었는지 등을 파악해야 합니다. 따라서 작성해야 하는 쿼리는 간단하거나 간단하지 않습니다.

SELECT 
    o.orderId,
    o.orderDate,
    o.order_cancelled,
    o.order_delivered,
    o.comments,
    
    CASE
        WHEN o.order_cancelled = TRUE THEN "Order was cancelled"
        WHEN ANY c IN o.comments SATISFIES LOWER(c) LIKE "%misplac%" 
                                    OR LOWER(c) LIKE "%lost%" THEN "Order may have been misplaced"
        WHEN ANY c IN o.comments SATISFIES LOWER(c) LIKE "%deliver%" THEN "Delivered but field not updated"
        ELSE "Reason unknown — investigate"
    END AS reason
FROM `commerce`.`sales`.`orders` AS o
WHERE o.order_delivered IS MISSING OR o.order_delivered IS NULL;

SELECT

o.orderId,

o.orderDate,

o.order_cancelled,

o.order_delivered,

o.comments,

CASE

WHEN o.order_cancelled = TRUE THEN "Order was cancelled"

WHEN ANY c IN o.comments SATISFIES LOWER(c) LIKE "%misplac%"

OR LOWER(c) LIKE "%lost%" THEN "Order may have been misplaced"

WHEN ANY c IN o.comments SATISFIES LOWER(c) LIKE "%deliver%" THEN "Delivered but field not updated"

ELSE "Reason unknown — investigate"

END AS reason

FROM `commerce`.`sales`.`orders` AS o

WHERE o.order_delivered IS MISSING OR o.order_delivered IS NULL;

이런 경우 24시간 연중무휴로 이 모든 답변을 얻을 수 있는 믿을 수 있는 조력자가 있다면 도움이 될 것입니다. 이 블로그에서 설명하는 UDF가 바로 그러한 어시스턴트입니다. 가장 자연스러운 방식으로 질문을 받아들이고 결과를 JSON으로 반환합니다. 그 뒤에는 API 키와 함께 사용자가 선택한 모델에 연결하여 사용자의 생각을 SQL++로 변환한 다음 실행합니다. 이 어시스턴트를 호출하는 데 필요한 것은 UDF를 사용하기만 하면 됩니다.

SELECT NL2SQL(
  [“`commerce`.`sales`.`orders`”],
  "Are there any documents where the order_delivered date is missing?and if so why?",
  "",
  "https://api.openai.com/v1/chat/completions",
  "gpt-4o-2024-05-13"
) ;

SELECT NL2SQL(

[“`commerce`.`sales`.`orders`”],

"Are there any documents where the order_delivered date is missing?and if so why?",

"",

"https://api.openai.com/v1/chat/completions",

"gpt-4o-2024-05-13"

) ;

작동 방식

1. 라이브러리를 설정합니다.
먼저 UDF에서 사용하는 자바스크립트 라이브러리를 생성합니다.

라이브러리:

/*
input:

keyspaces: an array of strings, each string represents a keyspaces "bucket.scope.collection" with proper escaping using grave-accent quote wherever required
prompt: users natural language request
apikey: your openai api key
model: string representing the model's name, see https://platform.openai.com/docs/api-reference/completions/create#completions-create-model for more details
 
output:
chat-completions api response with the generated sql statement
*/
 
 
 
function inferencer(k) {
   var infq = N1QL("SELECT t.properties FROM(INFER "+k+ ") as t") ;
   var res=[]
   for(const doc of infq) {
       res.push(doc)
   }
  
    return res[0];
}
 
 
function nl2sql(keyspaces, prompt, apikey, modelapi, model) {
  
   
   collectionSchema = {}
   for(const k in keyspaces) {
       c = inferencer(keyspaces[k])
       collectionSchema[keyspaces[k]] = c
   }
  
   collectionSchemaStr = JSON.stringify(collectionSchema)
  
   promptContent = `Information:\nCollection's schema: ${collectionSchemaStr}\n\nPrompt: \"${prompt}\"\n\nThe query context is set.\n\nBased on the above Information, write valid SQL++ and return only the statement and no explanation. For retrieval, use aliases. Use UNNEST from the FROM clause when appropriate. \n\nIf you're sure the Prompt can't be used to generate a query, first say \"#ERR:\" and then explain why not.`
  
    
    data = {"messages":[{"role":"system","content":"You are a Couchbase Capella expert. Your task is to create valid queries to retrieve or create data based on the provided Information.\n\nApproach this task step-by-step and take your time."},{"role":"user","content":promptContent}],
        "model": model,
        "temperature":0,
        "max_tokens":1024,
        "stream":false}

    
    
var dataStr = JSON.stringify(data)
    .replace(/\\/g, "\\\\")    // escape backslashes
    .replace(/"/g, '\\"');     // escape quotes
   
    var completionsurl = modelapi
  
   var q= `SELECT CURL("${completionsurl}", {
        "request": "POST",
        "header": ["Authorization: Bearer ${apikey}", "Content-type: application/json"],
        "data": "${dataStr}"
    }) AS result;`
   
    var completionsq = N1QL(q);
       
    
 
    var res = []
   for(const doc of completionsq) {
       res.push(doc);
     }
   
  try {
    content = res[0].result.choices[0].message.content    
  } catch(e) {
    return res;  
  } 
   
  stmt = content.trim().substring(7, content.length-4)
   
  isSelect = (stmt.substring(0,6).toLowerCase())==="select"
  if(isSelect === false){
      return {
          "generated_statement": stmt
      }
  }
  
  var runq = N1QL(stmt);
   
   var rrun = []
  for(const doc of runq) {
      rrun.push(doc)
  }
  
  return {
      "generated_statement": stmt,
      "results": rrun
  }
    
}

input:

keyspaces: an array of strings, each string represents a keyspaces "bucket.scope.collection" with proper escaping using grave-accent quote wherever required

prompt: users natural language request

apikey: your openai api key

model: string representing the model's name, see https://platform.openai.com/docs/api-reference/completions/create#completions-create-model for more details

output:

chat-completions api response with the generated sql statement

function inferencer(k) {

var infq = N1QL("SELECT t.properties FROM(INFER "+k+ ") as t") ;

var res=[]

for(const doc of infq) {

res.push(doc)

}

return res[0];

}

function nl2sql(keyspaces, prompt, apikey, modelapi, model) {

collectionSchema = {}

for(const k in keyspaces) {

c = inferencer(keyspaces[k])

collectionSchema[keyspaces[k]] = c

}

collectionSchemaStr = JSON.stringify(collectionSchema)

promptContent = `Information:\nCollection's schema: ${collectionSchemaStr}\n\nPrompt: \"${prompt}\"\n\nThe query context is set.\n\nBased on the above Information, write valid SQL++ and return only the statement and no explanation. For retrieval, use aliases. Use UNNEST from the FROM clause when appropriate. \n\nIf you're sure the Prompt can't be used to generate a query, first say \"#ERR:\" and then explain why not.`

data = {"messages":[{"role":"system","content":"You are a Couchbase Capella expert. Your task is to create valid queries to retrieve or create data based on the provided Information.\n\nApproach this task step-by-step and take your time."},{"role":"user","content":promptContent}],

"model": model,

"temperature":0,

"max_tokens":1024,

"stream":false}

var dataStr = JSON.stringify(data)

.replace(/\\/g, "\\\\") // escape backslashes

.replace(/"/g, '\\"'); // escape quotes

var completionsurl = modelapi

var q= `SELECT CURL("${completionsurl}", {

"request": "POST",

"header": ["Authorization: Bearer ${apikey}", "Content-type: application/json"],

"data": "${dataStr}"

}) AS result;`

var completionsq = N1QL(q);

var res = []

for(const doc of completionsq) {

res.push(doc);

}

try {

content = res[0].result.choices[0].message.content

} catch(e) {

return res;

}

stmt = content.trim().substring(7, content.length-4)

isSelect = (stmt.substring(0,6).toLowerCase())==="select"

if(isSelect === false){

return {

"generated_statement": stmt

}

var runq = N1QL(stmt);

var rrun = []

for(const doc of runq) {

rrun.push(doc)

}

return {

"generated_statement": stmt,

"results": rrun

}

2. 라이브러리를 업로드합니다.
제공된 라이브러리 코드를 파일에 복사한 후 curl 명령을 실행합니다(예:ailib.js 사용).

curl -X POST https://localhost:9499/evaluator/v1/libraries/usingailib 
--data-binary @usingailib.js -u Administrator:password

1 2	curl -X POST https://localhost:9499/evaluator/v1/libraries/usingailib --data-binary @usingailib.js -u Administrator:password

3. UDF를 생성합니다.
라이브러리를 생성한 후 아래의 create 함수 문을 사용하여 UDF를 생성합니다:

CREATE OR REPLACE FUNCTION NL2SQL(keyspaces, prompt, apikey, modelapi, model) 
LANGUAGE JAVASCRIPT AS "nl2sql" AT "usingailib";

1 2	CREATE OR REPLACE FUNCTION NL2SQL(keyspaces, prompt, apikey, modelapi, model) LANGUAGE JAVASCRIPT AS "nl2sql" AT "usingailib";

이제 NL2SQL()이 인간 언어와 Couchbase의 쿼리 엔진 사이에서 다국어 번역기 역할을 합니다. 컨텍스트와 자연어 요청을 제공하기만 하면 응답을 반환합니다.

UDF의 사고 방식

내부적으로는 UDF를 호출할 때 선호하는 모델을 사용하여 사용자의 의도를 파악하고 Couchbase가 실행할 수 있는 쿼리를 생성합니다.

채팅 완성 API를 사용하면 동일한 API 사양을 준수하는 다른 제공업체의 모델을 간단히 연결할 수 있다는 장점이 있습니다. 자체 비공개 LLM을 사용하거나 Open AI, Gemini, Claude 등의 알려진 LLM을 사용할 수 있습니다.

호출된 UDF를 사용하려면 다음 정보가 필요합니다:

키 스페이스 - 각각 카우치베이스 키공간(bucket.scope.collection)을 나타내는 문자열 배열로, 특수 이름을 이스케이프 처리하려면 필요한 경우 큰따옴표 따옴표를 사용합니다(예 여행 샘플.inventory.route). 이는 UDF에 데이터를 찾을 위치를 알려줍니다.
프롬프트 - 일반 영어(또는 다른 언어)로 요청합니다.
예시: 예: “지난 24시간 동안 구매한 모든 사용자 표시”
apikey - 모델 엔드포인트 인증에 사용되는 API 키입니다.
모델 엔드포인트 - 예: AI 호환 채팅 완료 URL 열기.
모델 - 공급업체에서 사용하려는 모델의 이름입니다.
예: “GPT-4O-2024-05-13”

라이브러리에는 몇 가지 사용 가능한 기능도 있습니다:

추론자()

쿼리를 생성하기 전에 UDF는 먼저 데이터를 이해하려고 시도합니다. inferencer() 도우미 함수는 Couchbase의 INFER 문을 호출하여 컬렉션의 스키마를 검색합니다:

function inferencer(k) {
   var infq = N1QL("SELECT t.properties FROM (INFER " + k + ") AS t");
   var res = [];
   for (const doc of infq) {
       res.push(doc);
   }
   return res[0];
}

function inferencer(k) {

var infq = N1QL("SELECT t.properties FROM (INFER " + k + ") AS t");

var res = [];

for (const doc of infq) {

res.push(doc);

}

return res[0];

}

이 스키마는 AI가 각 컬렉션에 어떤 종류의 데이터가 있는지 이해하는 데 사용됩니다.

주요 함수: nl2sql()

inferencer()를 사용하여 주어진 키스페이스에 대한 모든 스키마를 수집합니다. 추론된 스키마, 자연어 쿼리, LLM을 넛지하는 Couchbase 프롬프트가 포함된 프롬프트를 작성합니다.
LLM으로 전송합니다.
모델의 응답에서 생성된 SQL++를 추출합니다.
SELECT 문인 경우 직접 실행하고 생성된 SQL++ 문과 쿼리 결과를 모두 반환합니다.

비선택 문을 실행하지 않는 이유는 사용자가 확인하지 않고 컬렉션에 문서를 삽입, 업데이트 또는 삭제하는 것을 원하지 않기 때문입니다. 따라서 SQL++ 문을 사용하면 확인을 거친 후에 실행할 수 있습니다.

사용 사례 예시:

SELECT default:NL2SQL(
  ["`travel-sample`.inventory.hotel"],
  "Give me hotels in San Francisco that have free parking and free breakfast and a rating of more than 3",  "",
  "https://api.openai.com/v1/chat/completions",
  "gpt-4o-2024-05-13"
);

SELECT default:NL2SQL(

["`travel-sample`.inventory.hotel"],

"Give me hotels in San Francisco that have free parking and free breakfast and a rating of more than 3", "",

"https://api.openai.com/v1/chat/completions",

"gpt-4o-2024-05-13"

);

Result:
[{
    "$1": {
        "generated_statement": "SELECT h.name, h.address, h.city, h.state, h.country, h.free_parking, h.free_breakfast, r.ratings.Overall\nFROM `travel-sample`.inventory.hotel AS h\nUNNEST h.reviews AS r\nWHERE h.city = \"San Francisco\"\n  AND h.free_parking = true\n  AND h.free_breakfast = true\n  AND r.ratings.Overall > 3;",
        "results": [{
                "Overall": 4,
                "address": "520 Church St",
                "city": "San Francisco",
                "country": "United States",
                "free_breakfast": true,
                "free_parking": true,
                "name": "Parker House",
                "state": "California"
            },
            {
                "Overall": 4,
                "address": "520 Church St",
                "city": "San Francisco",
                "country": "United States",
                "free_breakfast": true,
                "free_parking": true,
                "name": "Parker House",
                "state": "California"
            },
            {
                "Overall": 5,
                "address": "520 Church St",
                "city": "San Francisco",
                "country": "United States",
                "free_breakfast": true,
                "free_parking": true,
                "name": "Parker House",
                "state": "California"
            },
            {
                "Overall": 4,
                "address": "520 Church St",
                "city": "San Francisco",
                "country": "United States",
                "free_breakfast": true,
                "free_parking": true,
                "name": "Parker House",
                "state": "California"
            },
            {
                "Overall": 5,
                "address": "465 Grant Ave",
                "city": "San Francisco",
                "country": "United States",
                "free_breakfast": true,
                "free_parking": true,
                "name": "Grant Plaza Hotel",
                "state": "California"
            },
            {
                "Overall": 5,
                "address": "465 Grant Ave",
                "city": "San Francisco",
                "country": "United States",
                "free_breakfast": true,
                "free_parking": true,
                "name": "Grant Plaza Hotel",
                "state": "California"
            },
...

Result:

[{

"$1": {

"generated_statement": "SELECT h.name, h.address, h.city, h.state, h.country, h.free_parking, h.free_breakfast, r.ratings.Overall\nFROM `travel-sample`.inventory.hotel AS h\nUNNEST h.reviews AS r\nWHERE h.city = \"San Francisco\"\n AND h.free_parking = true\n AND h.free_breakfast = true\n AND r.ratings.Overall > 3;",

"results": [{

"Overall": 4,

"address": "520 Church St",

"city": "San Francisco",

"country": "United States",

"free_breakfast": true,

"free_parking": true,

"name": "Parker House",

"state": "California"

{

"Overall": 4,

"address": "520 Church St",

"city": "San Francisco",

"country": "United States",

"free_breakfast": true,

"free_parking": true,

"name": "Parker House",

"state": "California"

{

"Overall": 5,

"address": "520 Church St",

"city": "San Francisco",

"country": "United States",

"free_breakfast": true,

"free_parking": true,

"name": "Parker House",

"state": "California"

{

"Overall": 4,

"address": "520 Church St",

"city": "San Francisco",

"country": "United States",

"free_breakfast": true,

"free_parking": true,

"name": "Parker House",

"state": "California"

{

"Overall": 5,

"address": "465 Grant Ave",

"city": "San Francisco",

"country": "United States",

"free_breakfast": true,

"free_parking": true,

"name": "Grant Plaza Hotel",

"state": "California"

{

"Overall": 5,

"address": "465 Grant Ave",

"city": "San Francisco",

"country": "United States",

"free_breakfast": true,

"free_parking": true,

"name": "Grant Plaza Hotel",

"state": "California"

...

다른 공급업체의 모델로 실험하기

다음 예제는 Gemini의 Open AI 호환 API를 사용합니다. 모델 공급자의 URL을 기존 Open AI API에서 Gemini의 API로 변경하기만 하면 됩니다. 또한 모델 파라미터를 인식할 수 있는 모델로 변경해야 합니다. 물론 api-키도 Open AI의 키에서 Gemini의 키로 업데이트해야 합니다.

SELECT NL2SQL(
  ["`travel-sample`.inventory.hotel"],
  "Show me hotels in France",
  "",
  "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
  "gemini-2.0-flash"
)as p;

SELECT NL2SQL(

["`travel-sample`.inventory.hotel"],

"Show me hotels in France",

"",

"https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",

"gemini-2.0-flash"

)as p;

결과는 다음과 같습니다:

[{
    "p": {
        "generated_statement": "SELECT h.name AS hotel_name, h.city AS hotel_city\nFROM `travel-sample`.inventory.hotel AS h\nWHERE h.country = \"France\";",
        "results": [{
                "hotel_city": "Giverny",
                "hotel_name": "The Robins"
            },
            {
                "hotel_city": "Giverny",
                "hotel_name": "Le Clos Fleuri"
            },
 	…
	     {
                "hotel_city": "Ferney-Voltaire",
                "hotel_name": "Hotel Formule 1"
            }
        ]
    }
}]

[{

"p": {

"generated_statement": "SELECT h.name AS hotel_name, h.city AS hotel_city\nFROM `travel-sample`.inventory.hotel AS h\nWHERE h.country = \"France\";",

"results": [{

"hotel_city": "Giverny",

"hotel_name": "The Robins"

{

"hotel_city": "Giverny",

"hotel_name": "Le Clos Fleuri"

…

{

"hotel_city": "Ferney-Voltaire",

"hotel_name": "Hotel Formule 1"

}

]

}

}]

결론

이 블로그에서는 AI를 활용하여 Couchbase에서 데이터와 상호 작용하는 방법을 소개합니다. 이 UDF를 사용하면 SQL++ 전문 지식 없이도 자연어 쿼리가 현실화됩니다. 모델에 구애받지 않으며 프로덕션 쿼리에도 안전합니다.

그리고 이것은 시작에 불과합니다. 향후에는 이를 더욱 확대할 계획입니다:

이미지 → SQL++
음성 → SQL++
에이전트와 유사한 파이프라인

... 모두 Couchbase 워크플로 내에서 실행됩니다.

참조
카펠라 IQ: https://docs.couchbase.com/cloud/get-started/capella-iq/get-started-with-iq.html
채팅 완료 API:
https://platform.openai.com/docs/api-reference/chat
https://ai.google.dev/gemini-api/docs/openai#rest

가우라브 자야라즈 - 소프트웨어 엔지니어

이 문서 공유하기

Platform

Self-Managed

Services

Capabilities

By Use Case

By Industry

Popular Docs

Quickstart

Resource Center

About

Partnerships

데이터와 대화하세요: 고객의 언어를 이해하는 UDF

작동 방식

UDF의 사고 방식

다른 공급업체의 모델로 실험하기

결론

받은 편지함에서 카우치베이스 블로그 업데이트 받기

작성자

게시자 가우라브 자야라즈 - 소프트웨어 엔지니어

댓글 남기기 응답 취소

카우치베이스 카펠라를 시작할 준비가 되셨나요?

구축 시작

카펠라 무료 사용

연락하기