Commit 1da2f467 authored by Aneeb Imamdin

JSON data embeddings support added with data file

parent 4aacb197
import json
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from typing import List, Optional
from google.cloud import bigquery
def convert_to_text(row):
    """Render a product record as newline-terminated ``Label : value`` lines.

    Args:
        row: Mapping describing one product. The keys ``title``,
            ``description``, ``brand``, ``price``, ``sellerName`` and
            ``categories`` are required. ``nutritionGroups`` (a mapping whose
            ``diet`` / ``dietaryRestriction`` / ``nutritionContent`` entries
            are lists of dicts with a ``name`` key) and ``marketingContent``
            (a list whose first element may carry ``marketingTitles``) are
            optional; missing or empty optional sections are skipped.

    Returns:
        str: The labelled lines, each terminated by ``\\n``.

    Raises:
        KeyError: If one of the required keys is absent.
    """
    lines = [
        'Title : {}'.format(row['title']),
        'Description : {}'.format(row['description']),
        'Brand : {}'.format(row['brand']),
        'Price : {}'.format(row['price']),
        'SellerName : {}'.format(row['sellerName']),
        'Categories : {}'.format(row['categories']),
    ]

    # Optional nutrition sections: (key inside nutritionGroups, output label).
    nutrition = row.get('nutritionGroups') or {}
    for key, label in (
        ('diet', 'Diet'),
        ('dietaryRestriction', 'Diet Restrictions'),
        ('nutritionContent', 'Nutrition Content'),
    ):
        items = nutrition.get(key)
        if items:
            names = ", ".join(item['name'] for item in items)
            lines.append('{} : {}'.format(label, names))

    # Only the first marketingContent entry is inspected, and only its
    # 'Varietal' titles are emitted.
    marketing = row.get('marketingContent') or []
    first_entry = (marketing[0] if marketing else None) or {}
    for tile in first_entry.get('marketingTitles') or []:
        if tile.get('name') == 'Varietal':
            lines.append('Varietal : {}'.format(tile['value']))

    return ''.join(line + '\n' for line in lines)
def convert_to_sentence(row):
    """Render a product record as a single natural-language sentence.

    Args:
        row: Mapping describing one product. The keys ``title``, ``brand``,
            ``price``, ``categories`` and ``sellerName`` are required.
            ``nutritionGroups`` (a mapping whose ``diet`` /
            ``dietaryRestriction`` / ``nutritionContent`` entries are lists
            of dicts with a ``name`` key) and ``marketingContent`` (a list
            whose first element may carry ``marketingTitles``) are optional;
            missing or empty optional sections are skipped.

    Returns:
        str: The descriptive sentence.

    Raises:
        KeyError: If one of the required keys is absent.
    """
    # The second placeholder is the brand; it previously repeated the title.
    parts = [
        'A {} by {}, priced at {} dollars'.format(
            row['title'], row['brand'], row['price']
        )
    ]

    # Optional nutrition clauses: (key inside nutritionGroups, clause template).
    nutrition = row.get('nutritionGroups') or {}
    for key, template in (
        ('diet', ', suitable for {} Diets'),
        ('dietaryRestriction', ', and it is {}'),
        ('nutritionContent', ', with {}'),
    ):
        items = nutrition.get(key)
        if items:
            parts.append(template.format(", ".join(item['name'] for item in items)))

    parts.append(
        '. It’s listed in the {} and is sold by {}'.format(
            row['categories'], row['sellerName']
        )
    )

    # Only the first marketingContent entry is inspected, and only its
    # 'Varietal' titles are appended.
    marketing = row.get('marketingContent') or []
    first_entry = (marketing[0] if marketing else None) or {}
    for tile in first_entry.get('marketingTitles') or []:
        if tile.get('name') == 'Varietal':
            parts.append(' with a {} varietal'.format(tile['value']))

    return ''.join(parts)
def embed_text(
    texts: Optional[List[str]] = None,
    task: str = "RETRIEVAL_DOCUMENT",
    dimensionality: Optional[int] = 256,
) -> List[List[float]]:
    """Embed texts with the ``text-embedding-004`` foundation model.

    Args:
        texts: Texts to embed. When ``None``, two built-in sample strings
            are embedded instead.
        task: Task type forwarded to ``TextEmbeddingInput``. Check the
            available tasks in the model's documentation.
        dimensionality: Requested size of each output vector. A falsy value
            (``None`` or ``0``) leaves the model's default size in effect.

    Returns:
        One embedding vector (list of floats) per input text, in order.

    NOTE(review): ``dimensionality`` used to be accepted but never sent to
    the model. Vectors stored before this fix therefore have the model's
    default size; confirm stored data is re-embedded before comparing it
    with new output.
    """
    if texts is None:
        texts = ["banana muffins? ", "banana bread? banana muffins?"]
    model = TextEmbeddingModel.from_pretrained("text-embedding-004")
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    # Only pass the size when one was requested so the model default applies
    # otherwise.
    kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]
def generate_embeddings(file_path='products_updated.json'):
    """Embed the labelled-text rendering of every product in a JSON file.

    Args:
        file_path: Path of a JSON file holding a list of product records.
            Defaults to ``products_updated.json`` in the working directory.

    Returns:
        list[dict]: One dict per product with keys ``text`` (the output of
        ``convert_to_text``), ``bpnId`` (copied from the record) and
        ``embeddings`` (the vector returned by ``embed_text``).
    """
    # JSON is read as UTF-8 explicitly rather than relying on the platform's
    # locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    data_array = []
    for row in data:
        text_row = convert_to_text(row)
        # One embedding request per product; the single result is at index 0.
        embedding = embed_text([text_row], "RETRIEVAL_DOCUMENT", 256)
        data_array.append({
            "text": text_row,
            "bpnId": row['bpnId'],
            "embeddings": embedding[0],
        })
    return data_array
def generate_embeddings_sentence(file_path='products_updated.json'):
    """Embed the sentence rendering of every product in a JSON file.

    Args:
        file_path: Path of a JSON file holding a list of product records.
            Defaults to ``products_updated.json`` in the working directory.

    Returns:
        list[dict]: One dict per product with keys ``text`` (the output of
        ``convert_to_sentence``), ``bpnId`` (copied from the record) and
        ``embeddings`` (the vector returned by ``embed_text``).
    """
    # JSON is read as UTF-8 explicitly rather than relying on the platform's
    # locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    data_array = []
    for row in data:
        text_row = convert_to_sentence(row)
        # One embedding request per product; the single result is at index 0.
        embedding = embed_text([text_row], "RETRIEVAL_DOCUMENT", 256)
        data_array.append({
            "text": text_row,
            "bpnId": row['bpnId'],
            "embeddings": embedding[0],
        })
    return data_array
def save_to_bq(data):
    """Insert rows into the product sentence-embeddings BigQuery table.

    Args:
        data: Rows to insert, passed unchanged to
            ``bigquery.Client.insert_rows_json``.

    The outcome is reported on stdout: a confirmation when the insert call
    reports no errors, otherwise the errors it returned.
    """
    destination = "ai_practice_dataset.product_sentence_embeddings"
    insert_errors = bigquery.Client().insert_rows_json(destination, data)

    # A non-empty result means at least one row was rejected.
    if insert_errors:
        print("Errors:", insert_errors)
        return
    print("New rows have been added.")
if __name__ == "__main__":
    # Running the module directly is deliberately a no-op. The calls below
    # are kept as a reminder of how to build and upload the embeddings.
    # generate_embeddings_sentence()
    # save_to_bq(generate_embeddings_sentence())
    pass
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -103,7 +103,7 @@ def vector_search_in_bigquery(query_embedding, table):
sql_query = f"""
SELECT base.text, distance
FROM VECTOR_SEARCH(
TABLE DATSET.{table} , 'embeddings',
TABLE ai_practice_dataset.{table} , 'embeddings',
(SELECT {query_embedding} as embed) , top_k => 5, distance_type => 'COSINE')
"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment