embeddings functions
diff --git a/R/bunch_embedding.R b/R/bunch_embedding.R
index 29e3dc6..d22a3ac 100644
--- a/R/bunch_embedding.R
+++ b/R/bunch_embedding.R
@@ -1,11 +1,48 @@
-gpt3.bunch_embedding = function(data
- , text_var
+#' Retrieves text embeddings for character input from a vector from the GPT-3 API
+#'
+#' @description
+#' `gpt3_bunch_embedding()` extends the single embeddings function `gpt3_make_embedding()` to allow for the processing of a whole vector
+#' @details The returned data.table contains the column `id` which indicates the text id (or its generic alternative if not specified) and the columns `dim_1` ... `dim_{max}`, where `max` is the length of the text embeddings vector that the four different models return. For the default "Ada" model, these are 1024 dimensions (i.e., `dim_1`... `dim_1024`).
+#'
+#' The function supports the text similarity embeddings for the four GPT-3 models as specified in the parameter list. The main difference between the four models is the sophistication of the embedding representation as indicated by the vector embedding size.
+#' - Ada (1024 dimensions)
+#' - Babbage (2048 dimensions)
+#' - Curie (4096 dimensions)
+#' - Davinci (12288 dimensions)
+#'
+#' Note that the dimension size (= vector length), speed and [associated costs](https://openai.com/api/pricing/) differ considerably.
+#'
+#' These vectors can be used for downstream tasks such as (vector) similarity calculations.
+#' @param input_var character vector that contains the texts for which you want to obtain text embeddings from the GPT-3 model
+#' @param id_var (optional) character vector that contains the user-defined ids of the prompts. See details.
+#' @param param_model a character vector that indicates the [similarity embedding model](https://beta.openai.com/docs/guides/embeddings/similarity-embeddings); one of "text-similarity-ada-001" (default), "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"
+#' @return A data.table with the embeddings as separate columns; one row represents one input text. See details.
+#' @examples
+#' # First authenticate with your API key via `gpt3_authenticate('pathtokey')`
+#'
+#' # Use example data:
+#' ## The data below were generated with the `gpt3_make_request()` function as follows:
+#' ##### DO NOT RUN #####
+#' # travel_blog_data = gpt3_make_request(prompt_input = "Write a travel blog about a dog's journey through the UK:", temperature = 0.8, n = 10, max_tokens = 200)[[1]]
+#' ##### END DO NOT RUN #####
+#'
+#' # You can load these data with:
+#' data("travel_blog_data") # the dataset contains 10 completions for the above request
+#'
+#' ## Obtain text embeddings for the completion texts:
+#' emb_travelblogs = gpt3_bunch_embedding(input_var = travel_blog_data$gpt3)
+#' dim(emb_travelblogs)
+#' @export
+gpt3_bunch_embedding = function(input_var
, id_var
, param_model = 'text-similarity-ada-001'){
- data_ = data
-
- data_length = data_[, .N]
+ data_length = length(input_var)
+ if(missing(id_var)){
+ data_id = paste0('prompt_', 1:data_length)
+ } else {
+ data_id = id_var
+ }
empty_list = list()
@@ -13,12 +50,12 @@
print(paste0('Embedding: ', i, '/', data_length))
- row_outcome = gpt3.make_embedding(model_ = param_model
- , input_ = as.character(unname(data_[i, ..text_var])))
+ row_outcome = gpt3_make_embedding(model = param_model
+ , input = input_var[i])
empty_df = data.frame(t(row_outcome))
names(empty_df) = paste0('dim_', 1:length(row_outcome))
- empty_df$id_full = as.character(unname(data_[i, ..id_var]))
+ empty_df$id = data_id[i]
empty_list[[i]] = empty_df
diff --git a/R/bunch_request.R b/R/bunch_request.R
index 398a7aa..dc9a7f2 100644
--- a/R/bunch_request.R
+++ b/R/bunch_request.R
@@ -1,12 +1,12 @@
#' Makes bunch completion requests to the GPT-3 API
#'
#' @description
-#' `gpt3_bunch_request()` is the package's main function for rquests and takes as input a vector of prompts and processes each prompt as per the defined parameters. It extends the `gpt3_simple_request()` function to allow for bunch processing of requests to the Open AI GPT-3 API.
+#' `gpt3_bunch_request()` is the package's main function for requests and takes as input a vector of prompts and processes each prompt as per the defined parameters. It extends the `gpt3_make_request()` function to allow for bunch processing of requests to the Open AI GPT-3 API.
#' @details
#' The easiest (and intended) use case for this function is to create a data.frame or data.table with variables that contain the prompts to be requested from GPT-3 and a prompt id (see examples below).
#' For a general guide on the completion requests, see [https://beta.openai.com/docs/guides/completion](https://beta.openai.com/docs/guides/completion). This function provides you with an R wrapper to send requests with the full range of request parameters as detailed on [https://beta.openai.com/docs/api-reference/completions](https://beta.openai.com/docs/api-reference/completions) and reproduced below.
#'
-#' For the `best_of` parameter: The `gpt3_simple_request()` (which is used here in a vectorised manner) handles the issue that best_of must be greater than n by setting if(best_of <= n){ best_of = n}.
+#' For the `best_of` parameter: The `gpt3_make_request()` (which is used here in a vectorised manner) handles the issue that best_of must be greater than n by setting `if(best_of <= n){ best_of = n}`.
#'
#' If `id_var` is not provided, the function will use `prompt_1` ... `prompt_n` as id variable.
#'
@@ -93,7 +93,7 @@
print(paste0('Request: ', i, '/', data_length))
- row_outcome = gpt3_simple_request(prompt_input = prompt_var[i]
+ row_outcome = gpt3_make_request(prompt_input = prompt_var[i]
, model = param_model
, output_type = 'complete'
, suffix = param_suffix
diff --git a/R/make_embedding.R b/R/make_embedding.R
index afdb610..a916ac6 100644
--- a/R/make_embedding.R
+++ b/R/make_embedding.R
@@ -1,8 +1,38 @@
-gpt3.make_embedding = function(model_ = 'text-similarity-ada-001'
- , input_){
+#' Obtains text embeddings for a single character (string) from the GPT-3 API
+#'
+#' @description
+#' `gpt3_make_embedding()` sends a single [embedding request](https://beta.openai.com/docs/guides/embeddings) to the Open AI GPT-3 API.
+#' @details The function supports the text similarity embeddings for the four GPT-3 models as specified in the parameter list. The main difference between the four models is the sophistication of the embedding representation as indicated by the vector embedding size.
+#' - Ada (1024 dimensions)
+#' - Babbage (2048 dimensions)
+#' - Curie (4096 dimensions)
+#' - Davinci (12288 dimensions)
+#'
+#' Note that the dimension size (= vector length), speed and [associated costs](https://openai.com/api/pricing/) differ considerably.
+#'
+#' These vectors can be used for downstream tasks such as (vector) similarity calculations.
+#' @param input character that contains the text for which you want to obtain text embeddings from the GPT-3 model
+#' @param model a character vector that indicates the [similarity embedding model](https://beta.openai.com/docs/guides/embeddings/similarity-embeddings); one of "text-similarity-ada-001" (default), "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"
+#' @return A numeric vector (= the embedding vector)
+#' @examples
+#' # First authenticate with your API key via `gpt3_authenticate('pathtokey')`
+#'
+#' # Once authenticated:
+#'
+#' ## Simple request with defaults:
+#' sample_string = "London is one of the most liveable cities in the world. The city is always full of energy and people. It's always a great place to explore and have fun."
+#' gpt3_make_embedding(input = sample_string)
+#'
+#' ## Change the model:
+#' gpt3_make_embedding(input = sample_string
+#' , model = 'text-similarity-curie-001')
+#' @export
+gpt3_make_embedding = function(input
+ , model = 'text-similarity-ada-001'
+ ){
- parameter_list = list(model = model_
- , input = input_)
+ parameter_list = list(model = model
+ , input = input)
request_base = httr::POST(url = url.embeddings
, body = parameter_list
@@ -12,7 +42,7 @@
output_base = httr::content(request_base)
- embedding_raw = toNumeric(unlist(output_base$data[[1]]$embedding))
+ embedding_raw = to_numeric(unlist(output_base$data[[1]]$embedding))
return(embedding_raw)
diff --git a/R/make_request.R b/R/make_request.R
index 3fb02e0..7fa653a 100644
--- a/R/make_request.R
+++ b/R/make_request.R
@@ -1,7 +1,7 @@
#' Makes a single completion request to the GPT-3 API
#'
#' @description
-#' `gpt3_simple_request()` sends a single [completion request](https://beta.openai.com/docs/api-reference/completions) to the Open AI GPT-3 API.
+#' `gpt3_make_request()` sends a single [completion request](https://beta.openai.com/docs/api-reference/completions) to the Open AI GPT-3 API.
#' @details For a general guide on the completion requests, see [https://beta.openai.com/docs/guides/completion](https://beta.openai.com/docs/guides/completion). This function provides you with an R wrapper to send requests with the full range of request parameters as detailed on [https://beta.openai.com/docs/api-reference/completions](https://beta.openai.com/docs/api-reference/completions) and reproduced below.
#'
#' For the `best_of` parameter: When used with n, best_of controls the number of candidate completions and n specifies how many to return – best_of must be greater than n. Note that this is handled by the wrapper automatically if(best_of <= n){ best_of = n}.
@@ -36,26 +36,26 @@
#' # Once authenticated:
#'
#' ## Simple request with defaults:
-#' gpt3_simple_request(prompt_input = 'How old are you?')
+#' gpt3_make_request(prompt_input = 'How old are you?')
#'
#' ## Instruct GPT-3 to write ten research ideas of max. 150 tokens with some controls:
-#'gpt3_simple_request(prompt_input = 'Write a research idea about using text data to understand human behaviour:'
+#'gpt3_make_request(prompt_input = 'Write a research idea about using text data to understand human behaviour:'
#' , temperature = 0.8
#' , n = 10
#' , max_tokens = 150)
#'
#' ## For fully reproducible results, we need `temperature = 0`, e.g.:
-#' gpt3_simple_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
+#' gpt3_make_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
#' , temperature = 0.0
#' , max_tokens = 50)
#'
#' ## The same example with a different GPT-3 model:
-#' gpt3_simple_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
+#' gpt3_make_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
#' , model = 'text-babbage-001'
#' , temperature = 0.0
#' , max_tokens = 50)
#' @export
-gpt3_simple_request = function(prompt_input
+gpt3_make_request = function(prompt_input
, model = 'text-davinci-002'
, output_type = 'complete'
, suffix = NULL
diff --git a/R/test_request.R b/R/test_request.R
index dbc3160..7b8c7e4 100644
--- a/R/test_request.R
+++ b/R/test_request.R
@@ -12,7 +12,7 @@
check_apikey_form()
test_prompt = 'Write a story about R Studio:'
- test_outout = gpt3_simple_request(prompt_ = test_prompt
+ test_outout = gpt3_make_request(prompt_input = test_prompt
, max_tokens = 100)
print(paste0('.. test successful ..'))
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 0000000..5c771d2
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,14 @@
+#' Convert character vector of numeric values into a numeric vector
+#'
+#' @description
+#' Converts a character vector of numeric values into a numeric vector
+#' @param x a character vector of numeric values
+#'
+#' @return A numeric vector
+#' @examples
+#' to_numeric('12312')
+#' @export
+to_numeric = function(x){
+ proxy = as.numeric(as.character(x))
+ return(proxy)
+}