Commit 0c63c8ce authored by BRellu's avatar BRellu

Fix the feature-specific review extraction

parent 261abd78
ASTRA_DB_API_ENDPOINT="https://0c52512b-0c65-4e70-ac47-71edf5244a82-us-east-2.apps.astra.datastax.com"
# NOTE(review): live credentials are committed to version control — rotate every
# token below and load them from an untracked .env / secrets manager instead.
ASTRA_DB_APPLICATION_TOKEN="AstraCS:EvXpFFafufegdQJvhqlYxmxt:ef86b5996013b12140b69254bd554d7e8e10eb5a7137859b9c432f92a5a3b65c"
ASTRA_DB_NAMESPACE="default_keyspace"
HF_TOKEN="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"
GROQ_API_KEY="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"
......@@ -26,4 +26,5 @@ Transform unstructured product reviews into actionable, feature-specific insight
Steps to run local:
pip install poetry
poetry lock
poetry install
......@@ -433,6 +433,18 @@ files = [
[package.dependencies]
packaging = "*"
[[package]]
name = "distro"
version = "1.9.0"
description = "Distro - an OS platform information API"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
]
[[package]]
name = "dnspython"
version = "2.7.0"
......@@ -738,6 +750,26 @@ files = [
docs = ["Sphinx", "furo"]
test = ["objgraph", "psutil"]
[[package]]
name = "groq"
version = "0.15.0"
description = "The official Python library for the groq API"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "groq-0.15.0-py3-none-any.whl", hash = "sha256:c200558b67fee4b4f2bb89cc166337e3419a68c23280065770f8f8b0729c79ef"},
{file = "groq-0.15.0.tar.gz", hash = "sha256:9ad08ba6156c67d0975595a8515b517f22ff63158e063c55192e161ed3648af1"},
]
[package.dependencies]
anyio = ">=3.5.0,<5"
distro = ">=1.7.0,<2"
httpx = ">=0.23.0,<1"
pydantic = ">=1.9.0,<3"
sniffio = "*"
typing-extensions = ">=4.10,<5"
[[package]]
name = "h11"
version = "0.14.0"
......@@ -1071,6 +1103,22 @@ PyYAML = ">=5.3"
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
typing-extensions = ">=4.7"
[[package]]
name = "langchain-groq"
version = "0.1.10"
description = "An integration package connecting Groq and LangChain"
optional = false
python-versions = "<4.0,>=3.8.1"
groups = ["main"]
files = [
{file = "langchain_groq-0.1.10-py3-none-any.whl", hash = "sha256:ef18331ba31b13ea44b7ac5467ad31b7efdf5bfe7c219cdf585311811b06f6b8"},
{file = "langchain_groq-0.1.10.tar.gz", hash = "sha256:a5e69190fd89420ab759845956f64230473c8229446c8e7022b1cbc269a51078"},
]
[package.dependencies]
groq = ">=0.4.1,<1"
langchain-core = ">=0.2.39,<0.3.0"
[[package]]
name = "langchain-huggingface"
version = "0.0.3"
......@@ -3239,4 +3287,4 @@ propcache = ">=0.2.0"
[metadata]
lock-version = "2.1"
python-versions = "^3.9"
content-hash = "8995a61b617d979c1ed855de907c850c048335f5f4fed85851a371e97ab4cfe1"
content-hash = "a980d3a86400e70c44fece09364c23407b1a898daad559746447df6ff7773437"
......@@ -16,6 +16,7 @@ python-dotenv = "^1.0.0"
transformers = "^4.30.0"
numpy = "^1.24.0"
pydantic = "^2.10.6"
langchain_groq = "^0.1.10"
[tool.poetry.dev-dependencies]
......
# src/reviewsense/core/llm.py
from langchain_groq import ChatGroq
from functools import lru_cache
@lru_cache()
def get_llm():
    """Return the process-wide ChatGroq client, constructing it only once."""
    # lru_cache on a zero-argument function makes this a lazy singleton.
    client = ChatGroq(
        temperature=1,
        model_name="llama-3.3-70b-versatile",
    )
    return client
\ No newline at end of file
from pydantic import BaseModel
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from typing import List, Optional
from reviewsense.core.llm import get_llm
class FeatureReviews(BaseModel):
    """Validated container for the LLM's feature-extraction output."""

    # Each entry is a dict holding an extracted "sentence" and its "sentiment".
    feature_reviews: List[dict]
class FeatureExtractor:
    """Pulls feature-specific sentences (with sentiment labels) out of raw reviews."""

    def __init__(self):
        """Wire up the shared LLM, the JSON output parser, and the prompt."""
        self.llm = get_llm()
        self.parser = self._create_reviews_parser()
        self.prompt = self._create_extraction_prompt()

    def _create_reviews_parser(self) -> JsonOutputParser:
        """Return the parser that turns the model's JSON reply into a dict."""
        return JsonOutputParser()

    def _create_extraction_prompt(self) -> ChatPromptTemplate:
        """Return the prompt asking for feature sentences plus per-sentence sentiment."""
        template = """Extract sentences about the given feature from the list of reviews.
Rules:
- Extract only parts discussing the specific feature.
- Remove unrelated parts connected by 'and' or 'but'.
- Keep original wording and capitalization.
- If there is only one review, apply the same rules to extract sentences about the feature.
Reviews: {reviews}
Feature: {feature}
Return only the parts discussing the specific feature and perform sentiment analysis for each extracted sentence in this JSON format:
{{
"feature_reviews": [
{{
"sentence": "relevant sentence 1",
"sentiment": "positive/negative/neutral"
}},
{{
"sentence": "relevant sentence 2",
"sentiment": "positive/negative/neutral"
}}
]
}}
"""
        return ChatPromptTemplate.from_template(template)

    def extract_feature_reviews(self, reviews: List[str], feature: str) -> List[dict]:
        """
        Extract feature-specific sentences from reviews with sentiment analysis

        Args:
            reviews: List of review texts
            feature: Target feature to extract

        Returns:
            List[dict]: Feature-specific sentences with sentiment analysis
        """
        try:
            # prompt -> LLM -> JSON parser, run as a single LCEL chain.
            pipeline = self.prompt | self.llm | self.parser
            raw_output = pipeline.invoke({
                "feature": feature,
                "reviews": "\n".join(reviews),
            })
            # Pydantic validation guards against malformed model output.
            validated = FeatureReviews(**raw_output)
            return validated.feature_reviews
        except Exception as e:
            # Best-effort: any failure (LLM, parsing, validation) yields no results.
            print(f"Error extracting feature reviews: {e}")
            return []
......@@ -2,22 +2,29 @@
from typing import List, Dict
from .retrieval import get_vector_store
from .feature_extractor import FeatureExtractor
class ReviewFetcher:
"""Class for fetching reviews from the vector store"""
def __init__(self):
    """Create the fetcher with its vector store and feature extractor."""
    self.feature_extractor = FeatureExtractor()
    self.vector_store = get_vector_store()
def fetch_reviews(self, product_id: str, features: List[str], threshold: float = 0.6) -> Dict[str, List[dict]]:
    """Fetch feature-specific review snippets for a product.

    Fix: the previous text contained unmerged diff residue — the old lines
    unconditionally assigned the raw filtered sentences for every feature
    before the new extraction logic conditionally overwrote them. Only the
    post-change logic is kept. The return annotation is also corrected to
    List[dict]: extract_feature_reviews yields sentence/sentiment dicts,
    matching ReviewRater.generate_feature_ratings' expected input.

    Args:
        product_id: Product title used as the vector-store filter.
        features: Feature names to search for.
        threshold: Minimum similarity score for a review to be kept.

    Returns:
        Dict[str, List[dict]]: Per feature, the extracted sentence/sentiment
        dicts; features with no sufficiently similar reviews are omitted.
    """
    feature_reviews = {}
    for feature in features:
        filter_criteria = {"title": product_id}
        documents = self.vector_store.similarity_search_with_score_id(
            query=feature, k=100, filter=filter_criteria
        )
        if not documents:
            continue
        # Keep only reviews similar enough to the feature query.
        filtered_reviews = [
            doc.page_content for doc, score, _ in documents if score > threshold
        ]
        if not filtered_reviews:
            continue
        extracted_reviews = self.feature_extractor.extract_feature_reviews(
            filtered_reviews, feature
        )
        if extracted_reviews:
            feature_reviews[feature] = extracted_reviews
    return feature_reviews
def generate_feature_query(self, feature: str) -> str:
......
......@@ -11,7 +11,7 @@ class ReviewRater:
def __init__(self):
    # Build the default sentiment classifier; `pipeline` is presumably
    # transformers.pipeline (imported above this hunk) — TODO confirm.
    self.sentiment_analyzer = pipeline("sentiment-analysis")
def generate_feature_ratings(self, reviews_by_feature: Dict[str, List[str]]) -> Dict[str, Optional[float]]:
def generate_feature_ratings(self, reviews_by_feature: Dict[str, List[dict]]) -> Dict[str, Optional[float]]:
feature_ratings = {}
for feature, reviews in reviews_by_feature.items():
......@@ -21,11 +21,10 @@ class ReviewRater:
ratings = []
for review in reviews:
sentiment = self.sentiment_analyzer(review)[0]
if "positive" in sentiment["label"].lower():
sentiment = review.get('sentiment')
if "positive" in sentiment:
ratings.append(5)
elif "negative" in sentiment["label"].lower():
elif "negative" in sentiment:
ratings.append(1)
else:
ratings.append(3)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment