#' Obtains text embeddings for a single character (string) from the GPT-3 API
#'
#' @description
#' `gpt3_single_embedding()` sends a single [embedding request](https://beta.openai.com/docs/guides/embeddings) to the Open AI GPT-3 API.
#' @details The function supports the text similarity embeddings for the GPT-3 models as specified in the parameter list. The main difference between the models is the sophistication of the embedding representation as indicated by the vector embedding size.
#'   - Second-generation embeddings model `text-embedding-ada-002` (1536 dimensions)
#'   - Ada (1024 dimensions)
#'   - Babbage (2048 dimensions)
#'   - Curie (4096 dimensions)
#'   - Davinci (12288 dimensions)
#'
#' Note that the dimension size (= vector length), speed and [associated costs](https://openai.com/api/pricing/) differ considerably.
#'
#' These vectors can be used for downstream tasks such as (vector) similarity calculations.
#'
#' The request is sent to `url.embeddings` and authorised with `api_key`. Neither object is created inside this function; both must already exist in an enclosing environment when the function is called (see the authentication step in the examples).
#'
#' If the API answers with an HTTP error status, a warning is raised that contains the status code and, when available, the error message returned by the API. The function does not stop in that case: it continues and returns whatever can be extracted from the response, which will not be a usable embedding.
#' @param input character that contains the text for which you want to obtain text embeddings from the GPT-3 model
#' @param model a character vector that indicates the [similarity embedding model](https://beta.openai.com/docs/guides/embeddings/similarity-embeddings); one of "text-embedding-ada-002" (default), "text-similarity-ada-001", "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001". Note: it is strongly recommended to use the faster, cheaper and higher quality second generation embeddings model "text-embedding-ada-002".
#' @return A numeric vector (= the embedding vector)
#' @examples
#' # First authenticate with your API key via `gpt3_authenticate('pathtokey')`
#'
#' # Once authenticated:
#'
#' ## Simple request with defaults:
#' sample_string = "London is one of the most liveable cities in the world. The city is always full of energy and people. It's always a great place to explore and have fun."
#' gpt3_single_embedding(input = sample_string)
#'
#' ## Change the model:
#' gpt3_single_embedding(input = sample_string
#'     , model = 'text-similarity-curie-001')
#' @export
gpt3_single_embedding = function(input
                                 , model = 'text-embedding-ada-002'
){

  # Request body; serialised to JSON by httr because of `encode = "json"` below.
  parameter_list = list(model = model
                        , input = input)

  # `url.embeddings` and `api_key` are resolved from an enclosing environment.
  request_base = httr::POST(url = url.embeddings
                            , body = parameter_list
                            , httr::add_headers(Authorization = paste("Bearer", api_key))
                            , encode = "json")

  output_base = httr::content(request_base)

  # An error response carries no `data` element, so without this check the
  # failure would pass silently. Warn (rather than stop) so that callers which
  # loop over many inputs are not aborted by a single failed request.
  if(httr::http_error(request_base)){

    api_message = NULL
    if(is.list(output_base) && is.list(output_base$error)){
      api_message = output_base$error$message
    }
    if(!is.character(api_message) || length(api_message) != 1){
      api_message = 'no error message returned'
    }

    warning(paste0('Embedding request failed (HTTP status '
                   , httr::status_code(request_base)
                   , '): '
                   , api_message)
            , call. = FALSE)

  }

  # Only the first element of `data` is read: one request, one embedding.
  # `to_numeric` is a helper defined outside this function.
  embedding_raw = to_numeric(unlist(output_base$data[[1]]$embedding))

  return(embedding_raw)

}