Commit 1ba254e7 authored by Aneeb Imamdin's avatar Aneeb Imamdin

Initial commit

parents
# Python
*.pyc
__pycache__/
*.pyo
*.pyd
.Python
env/
venv/
ENV/
VENV/
*.env
# Virtual environment
venv/
.venv/
env/
.env/
# IDE settings
.idea/
*.iml
.vscode/
# MacOS
.DS_Store
# Byte-compiled / optimized / DLL files
*.py[cod]
*$py.class
# Log files
*.log
# Jupyter Notebook checkpoints
.ipynb_checkpoints/
# pipenv
Pipfile.lock
# Google Cloud credentials
.gcloud/
# Project Title
This project performs vector search using Python and Google Cloud services like BigQuery. Follow the steps below to set up the environment, install dependencies, and authenticate with Google Cloud.
## Prerequisites
Ensure that the following are installed on your system:
- **Python** (version 3.6 or higher)
- **Pip** (Python's package installer)
- **Google Cloud SDK** for interacting with Google Cloud services
Ensure the following APIs are enabled in your Google Cloud project:
- **BigQuery API**
- **Vertex AI API**
### Install Google Cloud SDK
If you don't have the **Google Cloud SDK** installed, download and install it by following the official instructions [here](https://cloud.google.com/sdk/docs/install).
Once the SDK is installed, authenticate with your Google Cloud account by running the following command in your terminal or command prompt:
```bash
gcloud auth login
```
### Run the project
- Create Virtual Environment
```bash
python3 -m venv venv
```
- Activate Virtual Environment
```bash
source venv/bin/activate
```
- Install Project Requirements
```bash
pip install -r requirements.txt
```
- Replace `YOUR_TABLE_ID` with your actual BigQuery table ID on lines 91 and 106 of `text_embeddings.py`
```bash
YOUR_TABLE_ID
```
- Uncomment Line 123 to save data in BigQuery Table (Ignore This if already done)
```bash
save_to_bq(generate_embedding())
```
- Run `text_embeddings.py` if you want to store the data in BigQuery (skip this step if the data is already loaded)
```bash
python text_embeddings.py
```
- Once Data is stored run the Flask Server
```bash
python index.py
```
- The development server is now running; open the URL below in a browser to test the API
```bash
http://127.0.0.1:5000/vector-search?query=Test%20Query
```
### NOTES
- The `data.docx` file is already included in the code.
- If you want to change the data, you are free to replace the contents of the file.
- We take every paragraph and generate its embeddings; you are free to change this logic and define your preferred chunk size.
\ No newline at end of file
from flask import Flask, jsonify, request
from text_embeddings import vector_search_in_bigquery, embed_text
# Initialize the Flask application; routes below are registered on this instance.
app = Flask(__name__)
# Define a route for the GET API
@app.route('/vector-search', methods=['GET'])
def get_data():
    """Embed the ``query`` request argument and run a BigQuery vector search.

    Returns:
        A JSON array of matched rows (``text`` plus ``distance``).
    """
    query = request.args.get('query', default='Can i work from home ?')
    # Search queries should use the RETRIEVAL_QUERY task type; the stored
    # documents were embedded with RETRIEVAL_DOCUMENT (see text_embeddings.py).
    # The dimensionality (256) must match the stored embeddings.
    query_embeddings = embed_text([query], "RETRIEVAL_QUERY", 256)
    data = vector_search_in_bigquery(query_embeddings[0])
    # Return the data as a JSON response
    return jsonify(data)
# Run the Flask app
if __name__ == '__main__':
    # debug=True enables the reloader and interactive debugger;
    # intended for local development only, never production.
    app.run(debug=True)
\ No newline at end of file
from docx import Document
from google.cloud import bigquery
from typing import List, Optional
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
def read_file_content():
    """Read ``data.docx`` and return its non-empty paragraphs as text chunks.

    Each paragraph becomes one chunk. Blank paragraphs (common in Word
    documents as spacing) are skipped so we never request an embedding
    for an empty string.

    Returns:
        List[str]: one entry per non-empty paragraph.
    """
    doc = Document("data.docx")
    chunks = []
    for paragraph in doc.paragraphs:
        # Skip paragraphs that are empty or whitespace-only.
        if paragraph.text.strip():
            chunks.append(paragraph.text)
    return chunks
def split_text_into_chunks(text: str, max_chunk_size: int) -> List[str]:
    """Greedily pack whitespace-delimited words into chunks of at most
    ``max_chunk_size`` characters.

    Words are never broken apart: a single word longer than
    ``max_chunk_size`` is emitted as its own (oversized) chunk.

    Args:
        text: The text to split.
        max_chunk_size: Maximum length of each chunk, in characters.

    Returns:
        List[str]: the chunks, in original word order.
    """
    if len(text) <= max_chunk_size:
        return [text]
    chunks: List[str] = []
    current_chunk = ""
    for word in text.split():
        # Cost of adding this word: +1 for the joining space, but only
        # when the chunk already has content (the old code charged the
        # space even into an empty chunk).
        if current_chunk:
            needed = len(current_chunk) + 1 + len(word)
        else:
            needed = len(word)
        if needed <= max_chunk_size:
            current_chunk = f"{current_chunk} {word}" if current_chunk else word
        else:
            # Flush the finished chunk; never emit empty chunks (the old
            # code appended "" when the very first word was oversized).
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def embed_text(
    texts: list = None,
    task: str = "RETRIEVAL_DOCUMENT",
    dimensionality: Optional[int] = 256,
) -> List[List[float]]:
    """Embeds texts with a pre-trained, foundational model.

    Args:
        texts (List[str]): A list of texts to be embedded.
        task (str): The task type for embedding. Check the available tasks in the model's documentation.
        dimensionality (Optional[int]): The dimensionality of the output embeddings.

    Returns:
        List[List[float]]: A list of lists containing the embedding vectors for each input text
    """
    if texts is None:
        texts = ["banana muffins? ", "banana bread? banana muffins?"]
    model = TextEmbeddingModel.from_pretrained("text-embedding-004")
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    # Bug fix: the dimensionality argument was previously accepted but never
    # forwarded, so the model always returned its default-size vectors.
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]
def split_text_into_sentences(text):
    """Split *text* on periods, returning trimmed, non-empty pieces."""
    result = []
    for piece in text.split('.'):
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result
def generate_embedding():
    """Build the rows to store: one dict per document chunk, pairing the
    chunk's text with its 256-dimensional embedding vector."""
    # Read content from file, then embed every chunk individually.
    return [
        {
            "text": chunk,
            "embeddings": embed_text([chunk], "RETRIEVAL_DOCUMENT", 256)[0],
        }
        for chunk in read_file_content()
    ]
def save_to_bq(data):
    """Insert the given rows into the configured BigQuery table via the
    streaming-insert API, printing the outcome."""
    bq_client = bigquery.Client()
    # Placeholder: replace with the real "project.dataset.table" ID.
    table_id = "YOUR_TABLE_ID"
    insert_errors = bq_client.insert_rows_json(table_id, data)
    if insert_errors:
        print("Errors:", insert_errors)
    else:
        print("New rows have been added.")
def vector_search_in_bigquery(query_embedding):
    """Find the 5 stored chunks nearest to *query_embedding* by cosine distance.

    Args:
        query_embedding (List[float]): the embedding vector of the search query.

    Returns:
        List[dict]: one dict per match, with ``text`` and ``distance`` keys.
    """
    # Use a query parameter instead of f-string interpolation so the
    # embedding is passed safely and without string-formatting issues.
    sql_query = """
        SELECT base.text, distance
        FROM VECTOR_SEARCH(
            TABLE ai_practice_dataset.ai_poc_data, 'embeddings',
            (SELECT @query_embedding AS embed),
            top_k => 5, distance_type => 'COSINE')
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("query_embedding", "FLOAT64", query_embedding)
        ]
    )
    client = bigquery.Client()
    results = client.query(sql_query, job_config=job_config).result()
    return [dict(row) for row in results]
# Script entry point: running this module directly only prints a message;
# the one-off BigQuery data load below is left commented out (see README).
if __name__ == '__main__':
    # Uncomment this line if you want to insert the data
    # save_to_bq(generate_embedding())
    print("Running")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment