Embeddings functions
diff --git a/NAMESPACE b/NAMESPACE
index 81ac3c8..9a80e08 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,8 @@
export(gpt3_authenticate)
export(gpt3_bunch_request)
-export(gpt3_simple_request)
+export(gpt3_make_embedding)
+export(gpt3_make_request)
export(gpt3_test_request)
+export(to_numeric)
export(url.completions)
diff --git a/R/bunch_embedding.R b/R/bunch_embedding.R
index 29e3dc6..d22a3ac 100644
--- a/R/bunch_embedding.R
+++ b/R/bunch_embedding.R
@@ -1,11 +1,48 @@
-gpt3.bunch_embedding = function(data
- , text_var
+#' Retrieves text embeddings from the GPT-3 API for a vector of character inputs
+#'
+#' @description
+#' `gpt3_bunch_embedding()` extends the single embeddings function `gpt3_make_embedding()` to allow for the processing of a whole vector of texts.
+#' @details The returned data.table contains the column `id` which indicates the text id (or its generic alternative if not specified) and the columns `dim_1` ... `dim_{max}`, where `max` is the length of the text embeddings vector that the four different models return. For the default "Ada" model, these are 1024 dimensions (i.e., `dim_1`... `dim_1024`).
+#'
+#' The function supports the text similarity embeddings for the four GPT-3 models as specified in the parameter list. The main difference between the four models is the sophistication of the embedding representation as indicated by the vector embedding size.
+#' - Ada (1024 dimensions)
+#' - Babbage (2048 dimensions)
+#' - Curie (4096 dimensions)
+#' - Davinci (12288 dimensions)
+#'
+#' Note that the dimension size (= vector length), speed and [associated costs](https://openai.com/api/pricing/) differ considerably.
+#'
+#' These vectors can be used for downstream tasks such as (vector) similarity calculations.
+#' @param input_var character vector that contains the texts for which you want to obtain text embeddings from the GPT-3 model
+#' @param id_var (optional) character vector that contains the user-defined ids of the prompts. See details.
+#' @param param_model a character vector that indicates the [similarity embedding model](https://beta.openai.com/docs/guides/embeddings/similarity-embeddings); one of "text-similarity-ada-001" (default), "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"
+#' @return A data.table with the embeddings as separate columns; one row represents one input text. See details.
+#' @examples
+#' # First authenticate with your API key via `gpt3_authenticate('pathtokey')`
+#'
+#' # Use example data:
+#' ## The data below were generated with the `gpt3_make_request()` function as follows:
+#' ##### DO NOT RUN #####
+#' # travel_blog_data = gpt3_make_request(prompt_input = "Write a travel blog about a dog's journey through the UK:", temperature = 0.8, n = 10, max_tokens = 200)[[1]]
+#' ##### END DO NOT RUN #####
+#'
+#' # You can load these data with:
+#' data("travel_blog_data") # the dataset contains 10 completions for the above request
+#'
+#' ## Obtain text embeddings for the completion texts:
+#' emb_travelblogs = gpt3_bunch_embedding(input_var = travel_blog_data$gpt3)
+#' dim(emb_travelblogs)
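+#'
+#' ## A sketch (not run) of one possible downstream use: pairwise cosine similarities
+#' ## between the returned embeddings. This assumes the output holds only the `id`
+#' ## column plus the numeric `dim_*` columns, as described in the details.
+#' # emb_mat = as.matrix(emb_travelblogs[, grep('^dim_', names(emb_travelblogs)), with = FALSE])
+#' # norms = sqrt(rowSums(emb_mat^2))
+#' # cosine_sim = (emb_mat %*% t(emb_mat)) / (norms %o% norms)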
+#' @export
+gpt3_bunch_embedding = function(input_var
, id_var
, param_model = 'text-similarity-ada-001'){
- data_ = data
-
- data_length = data_[, .N]
+ data_length = length(input_var)
+ if(missing(id_var)){
+ data_id = paste0('prompt_', 1:data_length)
+ } else {
+ data_id = id_var
+ }
empty_list = list()
@@ -13,12 +50,12 @@
print(paste0('Embedding: ', i, '/', data_length))
- row_outcome = gpt3.make_embedding(model_ = param_model
- , input_ = as.character(unname(data_[i, ..text_var])))
+ row_outcome = gpt3_make_embedding(model = param_model
+ , input = input_var[i])
empty_df = data.frame(t(row_outcome))
names(empty_df) = paste0('dim_', 1:length(row_outcome))
- empty_df$id_full = as.character(unname(data_[i, ..id_var]))
+ empty_df$id = data_id[i]
empty_list[[i]] = empty_df
diff --git a/R/bunch_request.R b/R/bunch_request.R
index 398a7aa..dc9a7f2 100644
--- a/R/bunch_request.R
+++ b/R/bunch_request.R
@@ -1,12 +1,12 @@
#' Makes bunch completion requests to the GPT-3 API
#'
#' @description
-#' `gpt3_bunch_request()` is the package's main function for rquests and takes as input a vector of prompts and processes each prompt as per the defined parameters. It extends the `gpt3_simple_request()` function to allow for bunch processing of requests to the Open AI GPT-3 API.
+#' `gpt3_bunch_request()` is the package's main function for requests and takes as input a vector of prompts and processes each prompt as per the defined parameters. It extends the `gpt3_make_request()` function to allow for bunch processing of requests to the Open AI GPT-3 API.
#' @details
#' The easiest (and intended) use case for this function is to create a data.frame or data.table with variables that contain the prompts to be requested from GPT-3 and a prompt id (see examples below).
#' For a general guide on the completion requests, see [https://beta.openai.com/docs/guides/completion](https://beta.openai.com/docs/guides/completion). This function provides you with an R wrapper to send requests with the full range of request parameters as detailed on [https://beta.openai.com/docs/api-reference/completions](https://beta.openai.com/docs/api-reference/completions) and reproduced below.
#'
-#' For the `best_of` parameter: The `gpt3_simple_request()` (which is used here in a vectorised manner) handles the issue that best_of must be greater than n by setting if(best_of <= n){ best_of = n}.
+#' For the `best_of` parameter: The `gpt3_make_request()` (which is used here in a vectorised manner) handles the issue that best_of must be greater than n by setting `if(best_of <= n){ best_of = n}`.
#'
#' If `id_var` is not provided, the function will use `prompt_1` ... `prompt_n` as id variable.
#'
@@ -93,7 +93,7 @@
print(paste0('Request: ', i, '/', data_length))
- row_outcome = gpt3_simple_request(prompt_input = prompt_var[i]
+ row_outcome = gpt3_make_request(prompt_input = prompt_var[i]
, model = param_model
, output_type = 'complete'
, suffix = param_suffix
diff --git a/R/make_embedding.R b/R/make_embedding.R
index afdb610..a916ac6 100644
--- a/R/make_embedding.R
+++ b/R/make_embedding.R
@@ -1,8 +1,38 @@
-gpt3.make_embedding = function(model_ = 'text-similarity-ada-001'
- , input_){
+#' Obtains text embeddings for a single character (string) from the GPT-3 API
+#'
+#' @description
+#' `gpt3_make_embedding()` sends a single [embedding request](https://beta.openai.com/docs/guides/embeddings) to the Open AI GPT-3 API.
+#' @details The function supports the text similarity embeddings for the four GPT-3 models as specified in the parameter list. The main difference between the four models is the sophistication of the embedding representation as indicated by the vector embedding size.
+#' - Ada (1024 dimensions)
+#' - Babbage (2048 dimensions)
+#' - Curie (4096 dimensions)
+#' - Davinci (12288 dimensions)
+#'
+#' Note that the dimension size (= vector length), speed and [associated costs](https://openai.com/api/pricing/) differ considerably.
+#'
+#' These vectors can be used for downstream tasks such as (vector) similarity calculations.
+#' @param input character string that contains the text for which you want to obtain text embeddings from the GPT-3 model
+#' @param model a character vector that indicates the [similarity embedding model](https://beta.openai.com/docs/guides/embeddings/similarity-embeddings); one of "text-similarity-ada-001" (default), "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"
+#' @return A numeric vector (= the embedding vector)
+#' @examples
+#' # First authenticate with your API key via `gpt3_authenticate('pathtokey')`
+#'
+#' # Once authenticated:
+#'
+#' ## Simple request with defaults:
+#' sample_string = "London is one of the most liveable cities in the world. The city is always full of energy and people. It's always a great place to explore and have fun."
+#' gpt3_make_embedding(input = sample_string)
+#'
+#' ## Change the model:
+#' gpt3_make_embedding(input = sample_string
+#' , model = 'text-similarity-curie-001')
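+#'
+#' ## A sketch (not run) of a downstream use: cosine similarity between the
+#' ## embeddings of two texts (the second string here is just an illustration).
+#' # second_string = "Munich is a city in the south of Germany."
+#' # emb_1 = gpt3_make_embedding(input = sample_string)
+#' # emb_2 = gpt3_make_embedding(input = second_string)
+#' # sum(emb_1 * emb_2) / (sqrt(sum(emb_1^2)) * sqrt(sum(emb_2^2)))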
+#' @export
+gpt3_make_embedding = function(input
+ , model = 'text-similarity-ada-001'
+ ){
- parameter_list = list(model = model_
- , input = input_)
+ parameter_list = list(model = model
+ , input = input)
request_base = httr::POST(url = url.embeddings
, body = parameter_list
@@ -12,7 +42,7 @@
output_base = httr::content(request_base)
- embedding_raw = toNumeric(unlist(output_base$data[[1]]$embedding))
+ embedding_raw = to_numeric(unlist(output_base$data[[1]]$embedding))
return(embedding_raw)
diff --git a/R/make_request.R b/R/make_request.R
index 3fb02e0..7fa653a 100644
--- a/R/make_request.R
+++ b/R/make_request.R
@@ -1,7 +1,7 @@
#' Makes a single completion request to the GPT-3 API
#'
#' @description
-#' `gpt3_simple_request()` sends a single [completion request](https://beta.openai.com/docs/api-reference/completions) to the Open AI GPT-3 API.
+#' `gpt3_make_request()` sends a single [completion request](https://beta.openai.com/docs/api-reference/completions) to the Open AI GPT-3 API.
#' @details For a general guide on the completion requests, see [https://beta.openai.com/docs/guides/completion](https://beta.openai.com/docs/guides/completion). This function provides you with an R wrapper to send requests with the full range of request parameters as detailed on [https://beta.openai.com/docs/api-reference/completions](https://beta.openai.com/docs/api-reference/completions) and reproduced below.
#'
#' For the `best_of` parameter: When used with n, best_of controls the number of candidate completions and n specifies how many to return – best_of must be greater than n. Note that this is handled by the wrapper automatically if(best_of <= n){ best_of = n}.
@@ -36,26 +36,26 @@
#' # Once authenticated:
#'
#' ## Simple request with defaults:
-#' gpt3_simple_request(prompt_input = 'How old are you?')
+#' gpt3_make_request(prompt_input = 'How old are you?')
#'
#' ## Instruct GPT-3 to write ten research ideas of max. 150 tokens with some controls:
-#'gpt3_simple_request(prompt_input = 'Write a research idea about using text data to understand human behaviour:'
+#' gpt3_make_request(prompt_input = 'Write a research idea about using text data to understand human behaviour:'
#' , temperature = 0.8
#' , n = 10
#' , max_tokens = 150)
#'
#' ## For fully reproducible results, we need `temperature = 0`, e.g.:
-#' gpt3_simple_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
+#' gpt3_make_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
#' , temperature = 0.0
#' , max_tokens = 50)
#'
#' ## The same example with a different GPT-3 model:
-#' gpt3_simple_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
+#' gpt3_make_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
#' , model = 'text-babbage-001'
#' , temperature = 0.0
#' , max_tokens = 50)
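+#'
+#' ## A sketch (not run): the returned list keeps the text completions in slot [[1]]
+#' ## and the meta information in slot [[2]] (see the return value).
+#' # output = gpt3_make_request(prompt_input = 'How old are you?')
+#' # output[[1]]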
#' @export
-gpt3_simple_request = function(prompt_input
+gpt3_make_request = function(prompt_input
, model = 'text-davinci-002'
, output_type = 'complete'
, suffix = NULL
diff --git a/R/test_request.R b/R/test_request.R
index dbc3160..7b8c7e4 100644
--- a/R/test_request.R
+++ b/R/test_request.R
@@ -12,7 +12,7 @@
check_apikey_form()
test_prompt = 'Write a story about R Studio:'
- test_outout = gpt3_simple_request(prompt_ = test_prompt
+ test_outout = gpt3_make_request(prompt_input = test_prompt
, max_tokens = 100)
print(paste0('.. test successful ..'))
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 0000000..5c771d2
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,14 @@
+#' Convert character vector of numeric values into a numeric vector
+#'
+#' @description
+#' Converts a character vector of numeric values into a numeric vector
+#' @param x a character vector of numeric values
+#'
+#' @return A numeric vector
+#' @examples
+#' to_numeric('12312')
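+#' ## also works element-wise on longer character vectors, e.g. (illustration):
+#' to_numeric(c('0.12', '-3.5', '1e-4'))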
+#' @export
+to_numeric = function(x){
+ proxy = as.numeric(as.character(x))
+ return(proxy)
+}
diff --git a/data/travel_blog_data.RData b/data/travel_blog_data.RData
new file mode 100644
index 0000000..017c49c
--- /dev/null
+++ b/data/travel_blog_data.RData
Binary files differ
diff --git a/man/gpt3_bunch_embedding.Rd b/man/gpt3_bunch_embedding.Rd
new file mode 100644
index 0000000..9a33169
--- /dev/null
+++ b/man/gpt3_bunch_embedding.Rd
@@ -0,0 +1,56 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bunch_embedding.R
+\name{gpt3_bunch_embedding}
+\alias{gpt3_bunch_embedding}
+\title{Retrieves text embeddings from the GPT-3 API for a vector of character inputs}
+\usage{
+gpt3_bunch_embedding(
+ input_var,
+ id_var,
+ param_model = "text-similarity-ada-001"
+)
+}
+\arguments{
+\item{input_var}{character vector that contains the texts for which you want to obtain text embeddings from the GPT-3 model}
+
+\item{id_var}{(optional) character vector that contains the user-defined ids of the prompts. See details.}
+
+\item{param_model}{a character vector that indicates the \href{https://beta.openai.com/docs/guides/embeddings/similarity-embeddings}{similarity embedding model}; one of "text-similarity-ada-001" (default), "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"}
+}
+\value{
+A data.table with the embeddings as separate columns; one row represents one input text. See details.
+}
+\description{
+\code{gpt3_bunch_embedding()} extends the single embeddings function \code{gpt3_make_embedding()} to allow for the processing of a whole vector of texts.
+}
+\details{
+The returned data.table contains the column \code{id} which indicates the text id (or its generic alternative if not specified) and the columns \code{dim_1} ... \verb{dim_\{max\}}, where \code{max} is the length of the text embeddings vector that the four different models return. For the default "Ada" model, these are 1024 dimensions (i.e., \code{dim_1}... \code{dim_1024}).
+
+The function supports the text similarity embeddings for the four GPT-3 models as specified in the parameter list. The main difference between the four models is the sophistication of the embedding representation as indicated by the vector embedding size.
+\itemize{
+\item Ada (1024 dimensions)
+\item Babbage (2048 dimensions)
+\item Curie (4096 dimensions)
+\item Davinci (12288 dimensions)
+}
+
+Note that the dimension size (= vector length), speed and \href{https://openai.com/api/pricing/}{associated costs} differ considerably.
+
+These vectors can be used for downstream tasks such as (vector) similarity calculations.
+}
+\examples{
+# First authenticate with your API key via `gpt3_authenticate('pathtokey')`
+
+# Use example data:
+## The data below were generated with the `gpt3_make_request()` function as follows:
+##### DO NOT RUN #####
+# travel_blog_data = gpt3_make_request(prompt_input = "Write a travel blog about a dog's journey through the UK:", temperature = 0.8, n = 10, max_tokens = 200)[[1]]
+##### END DO NOT RUN #####
+
+# You can load these data with:
+data("travel_blog_data") # the dataset contains 10 completions for the above request
+
+## Obtain text embeddings for the completion texts:
+emb_travelblogs = gpt3_bunch_embedding(input_var = travel_blog_data$gpt3)
+dim(emb_travelblogs)
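+
+## A sketch (not run) of one possible downstream use: pairwise cosine similarities
+## between the returned embeddings (assumes only the id column plus the numeric dim_* columns):
+# emb_mat = as.matrix(emb_travelblogs[, grep('^dim_', names(emb_travelblogs)), with = FALSE])
+# norms = sqrt(rowSums(emb_mat^2))
+# cosine_sim = (emb_mat \%*\% t(emb_mat)) / (norms \%o\% norms)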
+}
diff --git a/man/gpt3_bunch_request.Rd b/man/gpt3_bunch_request.Rd
index 45c4949..347dfc4 100644
--- a/man/gpt3_bunch_request.Rd
+++ b/man/gpt3_bunch_request.Rd
@@ -58,13 +58,13 @@
If \code{output_type} is "meta", only the data table in slot [\link{2}] is returned.
}
\description{
-\code{gpt3_bunch_request()} is the package's main function for rquests and takes as input a vector of prompts and processes each prompt as per the defined parameters. It extends the \code{gpt3_simple_request()} function to allow for bunch processing of requests to the Open AI GPT-3 API.
+\code{gpt3_bunch_request()} is the package's main function for requests and takes as input a vector of prompts and processes each prompt as per the defined parameters. It extends the \code{gpt3_make_request()} function to allow for bunch processing of requests to the Open AI GPT-3 API.
}
\details{
The easiest (and intended) use case for this function is to create a data.frame or data.table with variables that contain the prompts to be requested from GPT-3 and a prompt id (see examples below).
For a general guide on the completion requests, see \url{https://beta.openai.com/docs/guides/completion}. This function provides you with an R wrapper to send requests with the full range of request parameters as detailed on \url{https://beta.openai.com/docs/api-reference/completions} and reproduced below.
-For the \code{best_of} parameter: The \code{gpt3_simple_request()} (which is used here in a vectorised manner) handles the issue that best_of must be greater than n by setting if(best_of <= n){ best_of = n}.
+For the \code{best_of} parameter: The \code{gpt3_make_request()} (which is used here in a vectorised manner) handles the issue that best_of must be greater than n by setting \code{if(best_of <= n){ best_of = n}}.
If \code{id_var} is not provided, the function will use \code{prompt_1} ... \code{prompt_n} as id variable.
diff --git a/man/gpt3_make_embedding.Rd b/man/gpt3_make_embedding.Rd
new file mode 100644
index 0000000..b339ab5
--- /dev/null
+++ b/man/gpt3_make_embedding.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/make_embedding.R
+\name{gpt3_make_embedding}
+\alias{gpt3_make_embedding}
+\title{Obtains text embeddings for a single character (string) from the GPT-3 API}
+\usage{
+gpt3_make_embedding(input, model = "text-similarity-ada-001")
+}
+\arguments{
+\item{input}{character string that contains the text for which you want to obtain text embeddings from the GPT-3 model}
+
+\item{model}{a character vector that indicates the \href{https://beta.openai.com/docs/guides/embeddings/similarity-embeddings}{similarity embedding model}; one of "text-similarity-ada-001" (default), "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"}
+}
+\value{
+A numeric vector (= the embedding vector)
+}
+\description{
+\code{gpt3_make_embedding()} sends a single \href{https://beta.openai.com/docs/guides/embeddings}{embedding request} to the Open AI GPT-3 API.
+}
+\details{
+The function supports the text similarity embeddings for the four GPT-3 models as specified in the parameter list. The main difference between the four models is the sophistication of the embedding representation as indicated by the vector embedding size.
+\itemize{
+\item Ada (1024 dimensions)
+\item Babbage (2048 dimensions)
+\item Curie (4096 dimensions)
+\item Davinci (12288 dimensions)
+}
+
+Note that the dimension size (= vector length), speed and \href{https://openai.com/api/pricing/}{associated costs} differ considerably.
+
+These vectors can be used for downstream tasks such as (vector) similarity calculations.
+}
+\examples{
+# First authenticate with your API key via `gpt3_authenticate('pathtokey')`
+
+# Once authenticated:
+
+## Simple request with defaults:
+sample_string = "London is one of the most liveable cities in the world. The city is always full of energy and people. It's always a great place to explore and have fun."
+gpt3_make_embedding(input = sample_string)
+
+## Change the model:
+gpt3_make_embedding(input = sample_string
+ , model = 'text-similarity-curie-001')
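+
+## A sketch (not run) of a downstream use: cosine similarity between the
+## embeddings of two texts (the second string is just an illustration).
+# second_string = "Munich is a city in the south of Germany."
+# emb_1 = gpt3_make_embedding(input = sample_string)
+# emb_2 = gpt3_make_embedding(input = second_string)
+# sum(emb_1 * emb_2) / (sqrt(sum(emb_1^2)) * sqrt(sum(emb_2^2)))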
+}
diff --git a/man/gpt3_simple_request.Rd b/man/gpt3_make_request.Rd
similarity index 91%
rename from man/gpt3_simple_request.Rd
rename to man/gpt3_make_request.Rd
index 8570a76..504c112 100644
--- a/man/gpt3_simple_request.Rd
+++ b/man/gpt3_make_request.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/make_request.R
-\name{gpt3_simple_request}
-\alias{gpt3_simple_request}
+\name{gpt3_make_request}
+\alias{gpt3_make_request}
\title{Makes a single completion request to the GPT-3 API}
\usage{
-gpt3_simple_request(
+gpt3_make_request(
prompt_input,
model = "text-davinci-002",
output_type = "complete",
@@ -55,7 +55,7 @@
If \code{output_type} is "meta", only the data table in slot [\link{2}] is returned.
}
\description{
-\code{gpt3_simple_request()} sends a single \href{https://beta.openai.com/docs/api-reference/completions}{completion request} to the Open AI GPT-3 API.
+\code{gpt3_make_request()} sends a single \href{https://beta.openai.com/docs/api-reference/completions}{completion request} to the Open AI GPT-3 API.
}
\details{
For a general guide on the completion requests, see \url{https://beta.openai.com/docs/guides/completion}. This function provides you with an R wrapper to send requests with the full range of request parameters as detailed on \url{https://beta.openai.com/docs/api-reference/completions} and reproduced below.
@@ -75,21 +75,21 @@
# Once authenticated:
## Simple request with defaults:
-gpt3_simple_request(prompt_input = 'How old are you?')
+gpt3_make_request(prompt_input = 'How old are you?')
## Instruct GPT-3 to write ten research ideas of max. 150 tokens with some controls:
-gpt3_simple_request(prompt_input = 'Write a research idea about using text data to understand human behaviour:'
+gpt3_make_request(prompt_input = 'Write a research idea about using text data to understand human behaviour:'
, temperature = 0.8
, n = 10
, max_tokens = 150)
## For fully reproducible results, we need `temperature = 0`, e.g.:
-gpt3_simple_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
+gpt3_make_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
, temperature = 0.0
, max_tokens = 50)
## The same example with a different GPT-3 model:
-gpt3_simple_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
+gpt3_make_request(prompt_input = 'Finish this sentence:/n There is no easier way to learn R than'
, model = 'text-babbage-001'
, temperature = 0.0
, max_tokens = 50)
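+
+## A sketch (not run): the returned list keeps the text completions in slot [[1]]
+## and the meta information in slot [[2]].
+# output = gpt3_make_request(prompt_input = 'How old are you?')
+# output[[1]]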
diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd
new file mode 100644
index 0000000..17cfc9c
--- /dev/null
+++ b/man/to_numeric.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{to_numeric}
+\alias{to_numeric}
+\title{Convert character vector of numeric values into a numeric vector}
+\usage{
+to_numeric(x)
+}
+\arguments{
+\item{x}{a character vector of numeric values}
+}
+\value{
+A numeric vector
+}
+\description{
+Converts a character vector of numeric values into a numeric vector
+}
+\examples{
+to_numeric('12312')
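+## also works element-wise on longer character vectors, e.g. (illustration):
+to_numeric(c('0.12', '-3.5', '1e-4'))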
+}