#' Retrieves text embeddings for character input from a vector from the GPT-3 API
#'
#' @description
#' `gpt3_embeddings()` extends the single embeddings function `gpt3_single_embedding()` to allow for the processing of a whole vector.
#' @details The returned data.table contains the column `id`, which indicates the text id (or its generic alternative if not specified), and the columns `dim_1` ... `dim_{max}`, where `max` is the length of the text embeddings vector that the different models (see below) return. For the default "Ada 2nd gen." model, these are 1536 dimensions (i.e., `dim_1` ... `dim_1536`).
#'
#' The function supports the text similarity embeddings for the [five GPT-3 embeddings models](https://beta.openai.com/docs/guides/embeddings/embedding-models) as specified in the parameter list. It is strongly advised to use the second-generation model "text-embedding-ada-002". The main difference between the five models is the size of the embedding representation, as indicated by the vector embedding size, and the pricing. The newest model (the default) is the fastest, cheapest and highest-quality one.
#' - Ada 2nd generation `text-embedding-ada-002` (1536 dimensions)
#' - Ada (1024 dimensions)
#' - Babbage (2048 dimensions)
#' - Curie (4096 dimensions)
#' - Davinci (12288 dimensions)
#'
#' Note that the dimension size (= vector length), speed and [associated costs](https://openai.com/api/pricing/) differ considerably.
#'
#' These vectors can be used for downstream tasks such as (vector) similarity calculations.
#' @param input_var character vector that contains the texts for which you want to obtain text embeddings from the GPT-3 model
#' @param id_var (optional) character vector that contains the user-defined ids of the prompts. See details.
#' @param param_model a character vector that indicates the [embedding model](https://beta.openai.com/docs/guides/embeddings/embedding-models); one of "text-embedding-ada-002" (default), "text-similarity-ada-001", "text-similarity-curie-001", "text-similarity-babbage-001", "text-similarity-davinci-001"
#' @return A data.table with the embeddings as separate columns; one row represents one input text. See details.
#' @examples
#' # First authenticate with your API key via `gpt3_authenticate('pathtokey')`
#'
#' # Use example data:
#' ## The data below were generated with the `gpt3_single_request()` function as follows:
#' ##### DO NOT RUN #####
#' # travel_blog_data = gpt3_single_request(prompt_input = "Write a travel blog about a dog's journey through the UK:", temperature = 0.8, n = 10, max_tokens = 200)[[1]]
#' ##### END DO NOT RUN #####
#'
#' # You can load these data with:
#' data("travel_blog_data") # the dataset contains 10 completions for the above request
#'
#' ## Obtain text embeddings for the completion texts:
#' emb_travelblogs = gpt3_embeddings(input_var = travel_blog_data$gpt3)
#' dim(emb_travelblogs)
#' @export
gpt3_embeddings = function(input_var
                           , id_var
                           , param_model = 'text-embedding-ada-002'){

  data_length = length(input_var)

  # Use generic ids ("prompt_1", "prompt_2", ...) unless the user supplied ids
  if(missing(id_var)){
    data_id = paste0('prompt_', seq_len(data_length))
  } else {
    data_id = id_var
  }

  empty_list = list()

  # Request one embedding per input text and collect the results
  # (seq_len() avoids the 1:0 pitfall if input_var is empty)
  for(i in seq_len(data_length)){

    print(paste0('Embedding: ', i, '/', data_length))

    row_outcome = gpt3_single_embedding(model = param_model
                                        , input = input_var[i])

    # Store the embedding vector as a one-row data.frame with dim_* columns
    empty_df = data.frame(t(row_outcome))
    names(empty_df) = paste0('dim_', seq_along(row_outcome))
    empty_df$id = data_id[i]

    empty_list[[i]] = empty_df

  }

  # Bind all one-row data.frames into a single data.table
  output_data = data.table::rbindlist(empty_list)

  return(output_data)

}