Commit 1ba254e7 authored by Aneeb Imamdin's avatar Aneeb Imamdin

Initial commit

parents
# Python
*.pyc
__pycache__/
*.pyo
*.pyd
.Python
env/
venv/
ENV/
VENV/
*.env
# Virtual environment
venv/
.venv/
env/
.env/
# IDE settings
.idea/
*.iml
.vscode/
# MacOS
.DS_Store
# Byte-compiled / optimized / DLL files
*.py[cod]
*$py.class
# Log files
*.log
# Jupyter Notebook checkpoints
.ipynb_checkpoints/
# pipenv
Pipfile.lock
# Google Cloud credentials
.gcloud/
# Project Title
This project performs vector search using Python and Google Cloud services like BigQuery. Follow the steps below to set up the environment, install dependencies, and authenticate with Google Cloud.
## Prerequisites
Ensure that the following are installed on your system:
- **Python** (version 3.6 or higher)
- **Pip** (Python's package installer)
- **Google Cloud SDK** for interacting with Google Cloud services
Ensure the following APIs are enabled in your Google Cloud project:
- **BigQuery API**
- **Vertex AI API**
### Install Google Cloud SDK
If you don't have the **Google Cloud SDK** installed, download and install it by following the official instructions [here](https://cloud.google.com/sdk/docs/install).
Once the SDK is installed, authenticate with your Google Cloud account by running the following command in your terminal or command prompt:
```bash
gcloud auth login
```
### Run the project
- Create Virtual Environment
```bash
python3 -m venv venv
```
- Activate Virtual Environment
```bash
source venv/bin/activate
```
- Install Project Requirements
```bash
pip install -r requirements.txt
```
- Replace `YOUR_TABLE_ID` with your actual BigQuery table ID on lines 91 and 106 of `text_embeddings.py`
```bash
YOUR_TABLE_ID
```
- Uncomment Line 123 to save data in BigQuery Table (Ignore This if already done)
```bash
save_to_bq(generate_embedding())
```
- Run `text_embeddings.py` if you want to store the data in BigQuery (skip this step if the data is already loaded)
```bash
python text_embeddings.py
```
- Once Data is stored run the Flask Server
```bash
python index.py
```
- The development server is now running; open the URL below in a browser to test the API
```bash
http://127.0.0.1:5000/vector-search?query=Test%20Query
```
### NOTES
- The `data.docx` file is already included in the code.
- If you want to change the data, you are free to replace the contents of the file.
- We take every paragraph and generate its embeddings; you are free to change this logic and define your preferred chunk size.
\ No newline at end of file
from flask import Flask, jsonify, request
from text_embeddings import vector_search_in_bigquery, embed_text
# Initialize the Flask application; routes below are registered on this instance.
app = Flask(__name__)
# Define a route for the GET API
@app.route('/vector-search', methods=['GET'])
def get_data():
    """Embed the ``query`` request argument and run a BigQuery vector search.

    Returns:
        A JSON array of matched rows (``text`` plus ``distance``).
    """
    query = request.args.get('query', default='Can i work from home ?')
    # Search queries should use the RETRIEVAL_QUERY task type; the stored
    # documents were embedded with RETRIEVAL_DOCUMENT (see text_embeddings.py).
    # The dimensionality (256) must match the stored embeddings.
    query_embeddings = embed_text([query], "RETRIEVAL_QUERY", 256)
    data = vector_search_in_bigquery(query_embeddings[0])
    # Return the data as a JSON response
    return jsonify(data)
# Run the Flask app
if __name__ == '__main__':
    # debug=True enables the reloader and interactive debugger;
    # intended for local development only, never production.
    app.run(debug=True)
\ No newline at end of file
from docx import Document
from google.cloud import bigquery
from typing import List, Optional
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
def read_file_content():
    """Read ``data.docx`` and return its non-empty paragraphs as text chunks.

    Each paragraph becomes one chunk. Blank paragraphs (common in Word
    documents as spacing) are skipped so we never request an embedding
    for an empty string.

    Returns:
        List[str]: one entry per non-empty paragraph.
    """
    doc = Document("data.docx")
    chunks = []
    for paragraph in doc.paragraphs:
        # Skip paragraphs that are empty or whitespace-only.
        if paragraph.text.strip():
            chunks.append(paragraph.text)
    return chunks
def split_text_into_chunks(text: str, max_chunk_size: int) -> List[str]:
    """Greedily pack whitespace-delimited words into chunks of at most
    ``max_chunk_size`` characters.

    Words are never broken apart: a single word longer than
    ``max_chunk_size`` is emitted as its own (oversized) chunk.

    Args:
        text: The text to split.
        max_chunk_size: Maximum length of each chunk, in characters.

    Returns:
        List[str]: the chunks, in original word order.
    """
    if len(text) <= max_chunk_size:
        return [text]
    chunks: List[str] = []
    current_chunk = ""
    for word in text.split():
        # Cost of adding this word: +1 for the joining space, but only
        # when the chunk already has content (the old code charged the
        # space even into an empty chunk).
        if current_chunk:
            needed = len(current_chunk) + 1 + len(word)
        else:
            needed = len(word)
        if needed <= max_chunk_size:
            current_chunk = f"{current_chunk} {word}" if current_chunk else word
        else:
            # Flush the finished chunk; never emit empty chunks (the old
            # code appended "" when the very first word was oversized).
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def embed_text(
    texts: list = None,
    task: str = "RETRIEVAL_DOCUMENT",
    dimensionality: Optional[int] = 256,
) -> List[List[float]]:
    """Embeds texts with a pre-trained, foundational model.

    Args:
        texts (List[str]): A list of texts to be embedded.
        task (str): The task type for embedding. Check the available tasks in the model's documentation.
        dimensionality (Optional[int]): The dimensionality of the output embeddings.

    Returns:
        List[List[float]]: A list of lists containing the embedding vectors for each input text
    """
    if texts is None:
        texts = ["banana muffins? ", "banana bread? banana muffins?"]
    model = TextEmbeddingModel.from_pretrained("text-embedding-004")
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    # Bug fix: the dimensionality argument was previously accepted but never
    # forwarded, so the model always returned its default-size vectors.
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]
def split_text_into_sentences(text):
    """Split *text* on periods, returning trimmed, non-empty pieces."""
    result = []
    for piece in text.split('.'):
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result
def generate_embedding():
    """Build the rows to store: one dict per document chunk, pairing the
    chunk's text with its 256-dimensional embedding vector."""
    # Read content from file, then embed every chunk individually.
    return [
        {
            "text": chunk,
            "embeddings": embed_text([chunk], "RETRIEVAL_DOCUMENT", 256)[0],
        }
        for chunk in read_file_content()
    ]
def save_to_bq(data):
    """Insert the given rows into the configured BigQuery table via the
    streaming-insert API, printing the outcome."""
    bq_client = bigquery.Client()
    # Placeholder: replace with the real "project.dataset.table" ID.
    table_id = "YOUR_TABLE_ID"
    insert_errors = bq_client.insert_rows_json(table_id, data)
    if insert_errors:
        print("Errors:", insert_errors)
    else:
        print("New rows have been added.")
def vector_search_in_bigquery(query_embedding):
    """Find the 5 stored chunks nearest to *query_embedding* by cosine distance.

    Args:
        query_embedding (List[float]): the embedding vector of the search query.

    Returns:
        List[dict]: one dict per match, with ``text`` and ``distance`` keys.
    """
    # Use a query parameter instead of f-string interpolation so the
    # embedding is passed safely and without string-formatting issues.
    sql_query = """
        SELECT base.text, distance
        FROM VECTOR_SEARCH(
            TABLE ai_practice_dataset.ai_poc_data, 'embeddings',
            (SELECT @query_embedding AS embed),
            top_k => 5, distance_type => 'COSINE')
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("query_embedding", "FLOAT64", query_embedding)
        ]
    )
    client = bigquery.Client()
    results = client.query(sql_query, job_config=job_config).result()
    return [dict(row) for row in results]
# Script entry point: running this module directly only prints a message;
# the one-off BigQuery data load below is left commented out (see README).
if __name__ == '__main__':
    # Uncomment this line if you want to insert the data
    # save_to_bq(generate_embedding())
    print("Running")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment