JSON data embeddings support added, with data file

1da2f467 · Aneeb Imamdin · 4aacb197 · 1da2f467 · 1da2f467 · 1da2f467
Commit 1da2f467 authored Oct 03, 2024 by Aneeb Imamdin
Show whitespace changes
Inline Side-by-side

Showing with 2821 additions and 1 deletion

json_embeddings.py json_embeddings.py +139 -0

products_updated.json products_updated.json +2681 -0

text_embeddings.py text_embeddings.py +1 -1

No files found.
--- a/json_embeddings.py
+++ b/json_embeddings.py
+import json
+from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
+from typing import List, Optional
+from google.cloud import bigquery
+
+
+def convert_to_text(row):
+    """Flatten one product record into labelled ``Key : value`` lines.
+
+    Args:
+        row: Mapping read with the keys ``title``, ``description``, ``brand``,
+            ``price``, ``sellerName``, ``categories``, ``nutritionGroups`` and
+            ``marketingContent``.
+
+    Returns:
+        str: One ``Label : value`` line per field, each terminated by a
+        newline. Nutrition lines appear only for non-empty groups, and a
+        ``Varietal`` line is added for every matching marketing title of the
+        first marketing-content entry.
+    """
+    lines = [
+        f"Title : {row['title']}",
+        f"Description : {row['description']}",
+        f"Brand : {row['brand']}",
+        f"Price : {row['price']}",
+        f"SellerName : {row['sellerName']}",
+        f"Categories : {row['categories']}",
+    ]
+
+    groups = row['nutritionGroups']
+    if groups:
+        # (output label, key inside nutritionGroups), in output order.
+        labelled_keys = (
+            ('Diet', 'diet'),
+            ('Diet Restrictions', 'dietaryRestriction'),
+            ('Nutrition Content', 'nutritionContent'),
+        )
+        for label, key in labelled_keys:
+            entries = groups[key]
+            if entries:
+                names = ", ".join(entry['name'] for entry in entries)
+                lines.append(f"{label} : {names}")
+
+    marketing = row['marketingContent']
+    # Only the first marketing-content entry is inspected.
+    if marketing and marketing[0] and marketing[0]['marketingTitles']:
+        for title in marketing[0]['marketingTitles']:
+            if title['name'] == 'Varietal':
+                lines.append(f"Varietal : {title['value']}")
+
+    return "".join(line + "\n" for line in lines)
+
+
+def convert_to_sentence(row):
+    """Render one product record as a natural-language sentence.
+
+    Args:
+        row: Mapping read with the keys ``title``, ``brand``, ``price``,
+            ``categories``, ``sellerName``, ``nutritionGroups`` and
+            ``marketingContent``.
+
+    Returns:
+        str: A sentence of the form ``A <title> by <brand>, priced at <price>
+        dollars ...`` extended with the non-empty nutrition groups, the
+        categories and seller, and any ``Varietal`` marketing title of the
+        first marketing-content entry.
+    """
+    # The maker slot uses the brand; it previously repeated the title
+    # ("A <title> by <title>").
+    text_content = 'A {} by {}, priced at {} dollars'.format(
+        row['title'], row['brand'], row['price']
+    )
+
+    nutrition = row['nutritionGroups']
+    if nutrition:
+        if nutrition['diet']:
+            result = ", ".join(item['name'] for item in nutrition['diet'])
+            text_content += ', suitable for {} Diets'.format(result)
+
+        if nutrition['dietaryRestriction']:
+            result = ", ".join(item['name'] for item in nutrition['dietaryRestriction'])
+            text_content += ', and it is {}'.format(result)
+
+        if nutrition['nutritionContent']:
+            result = ", ".join(item['name'] for item in nutrition['nutritionContent'])
+            text_content += ', with {}'.format(result)
+
+    text_content += '. It’s listed in the {} and is sold by {}'.format(row['categories'], row['sellerName'])
+
+    marketing = row['marketingContent']
+    # Only the first marketing-content entry is inspected.
+    if marketing and marketing[0] and marketing[0]['marketingTitles']:
+        for tile in marketing[0]['marketingTitles']:
+            if tile['name'] == 'Varietal':
+                text_content += ' with a {} varietal'.format(tile['value'])
+
+    return text_content
+
+
+def embed_text(
+    texts: Optional[List[str]] = None,
+    task: str = "RETRIEVAL_DOCUMENT",
+    dimensionality: Optional[int] = 256,
+) -> List[List[float]]:
+    """Embeds texts with a pre-trained, foundational model.
+
+    Args:
+        texts (List[str]): A list of texts to be embedded. When ``None``, two
+            built-in sample strings are embedded instead.
+        task (str): The task type for embedding. Check the available tasks in the model's documentation.
+        dimensionality (Optional[int]): The dimensionality of the output embeddings.
+            A falsy value (``None`` or ``0``) leaves the model's default size.
+
+    Returns:
+        List[List[float]]: A list of lists containing the embedding vectors for each input text
+    """
+    if texts is None:
+        texts = ["banana muffins? ", "banana bread? banana muffins?"]
+
+    model = TextEmbeddingModel.from_pretrained("text-embedding-004")
+    inputs = [TextEmbeddingInput(text, task) for text in texts]
+
+    # Forward the requested size to the model; it was previously accepted but
+    # never passed on, so the argument had no effect.
+    # NOTE(review): vectors stored before this fix were produced without this
+    # argument - confirm stored and query embeddings share one dimensionality.
+    kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
+    embeddings = model.get_embeddings(inputs, **kwargs)
+    return [embedding.values for embedding in embeddings]
+
+
+def generate_embeddings(file_path='products_updated.json'):
+    """Embed every product of a JSON file using its labelled-text rendering.
+
+    Args:
+        file_path: Path of the JSON file to read; it is iterated as a
+            sequence of product records. Defaults to ``products_updated.json``.
+
+    Returns:
+        list: One dict per record with the keys ``text`` (output of
+        ``convert_to_text``), ``bpnId`` (copied from the record) and
+        ``embeddings`` (the vector returned for that text).
+    """
+    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
+    # default locale encoding.
+    with open(file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    data_array = []
+    for row in data:
+        text_row = convert_to_text(row)
+        # One embedding request per record; embed_text returns a list, so the
+        # single result is at index 0.
+        embedding = embed_text([text_row], "RETRIEVAL_DOCUMENT", 256)
+        data_array.append({
+            "text": text_row,
+            "bpnId": row['bpnId'],
+            "embeddings": embedding[0]
+        })
+
+    return data_array
+
+
+def generate_embeddings_sentence(file_path='products_updated.json'):
+    """Embed every product of a JSON file using its sentence rendering.
+
+    Args:
+        file_path: Path of the JSON file to read; it is iterated as a
+            sequence of product records. Defaults to ``products_updated.json``.
+
+    Returns:
+        list: One dict per record with the keys ``text`` (output of
+        ``convert_to_sentence``), ``bpnId`` (copied from the record) and
+        ``embeddings`` (the vector returned for that text).
+    """
+    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
+    # default locale encoding.
+    with open(file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    data_array = []
+    for row in data:
+        text_row = convert_to_sentence(row)
+        # One embedding request per record; embed_text returns a list, so the
+        # single result is at index 0.
+        embedding = embed_text([text_row], "RETRIEVAL_DOCUMENT", 256)
+        data_array.append({
+            "text": text_row,
+            "bpnId": row['bpnId'],
+            "embeddings": embedding[0]
+        })
+
+    return data_array
+
+
+def save_to_bq(data):
+    """Insert rows into ``ai_practice_dataset.product_sentence_embeddings``.
+
+    Args:
+        data: Rows handed unchanged to ``Client.insert_rows_json``.
+
+    Prints a confirmation when the call reports no errors; otherwise prints
+    the errors it returned. Nothing is returned.
+    """
+    destination = "ai_practice_dataset.product_sentence_embeddings"
+
+    # A fresh client is created for every call.
+    failures = bigquery.Client().insert_rows_json(destination, data)
+
+    if failures:
+        print("Errors:", failures)
+        return
+    print("New rows have been added.")
+
+
+# Script entry point: running this module directly currently does nothing.
+# The commented-out calls below would build the sentence embeddings and
+# insert them into BigQuery via save_to_bq.
+if __name__ == "__main__":
+    pass
+    # generate_embeddings_sentence()
+    # save_to_bq(generate_embeddings_sentence())
--- a/products_updated.json
+++ b/products_updated.json
--- a/text_embeddings.py
+++ b/text_embeddings.py
@@ -103,7 +103,7 @@ def vector_search_in_bigquery(query_embedding, table):
    sql_query = f"""
        SELECT base.text, distance
        FROM VECTOR_SEARCH(
-   TABLE DATSET.{table} , 'embeddings',
+   TABLE ai_practice_dataset.{table} , 'embeddings',
   (SELECT {query_embedding} as embed) , top_k => 5, distance_type => 'COSINE')
    """