added python code and requirements

7d91b281 · LSING46 · 1c607c4f · 7d91b281 · 7d91b281
Commit 7d91b281 authored Jan 18, 2025 by LSING46
Show whitespace changes
Inline Side-by-side

Showing with 96 additions and 0 deletions

cv_filter_tool.py src/cv_filter_tool.py +91 -0

requirements.text src/requirements.text +5 -0

No files found.
--- a/src/cv_filter_tool.py
+++ b/src/cv_filter_tool.py
+import nltk
+import PyPDF2
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import BertTokenizer, BertModel
+import torch
+import string
+
+# Download the NLTK resources used by preprocess_text (you can comment these
+# out if already done). These calls run at import time and need network
+# access the first time.
+nltk.download('stopwords')
+nltk.download('punkt')
+# Recent NLTK releases (3.8.2 and later) make word_tokenize load the
+# 'punkt_tab' resource instead of the pickled 'punkt' models; without it the
+# tokenizer raises LookupError on a fresh install of an unpinned nltk.
+nltk.download('punkt_tab')
+
+# Initialize the BERT tokenizer and model once at module level so that every
+# call to get_bert_embeddings reuses the same loaded weights.
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertModel.from_pretrained('bert-base-uncased')
+
+
+# Function to preprocess text (remove stopwords, punctuation, etc.)
+def preprocess_text(text):
+    """Return ``text`` lowercased, stripped of punctuation and stopwords.
+
+    Args:
+        text: The raw input string.
+
+    Returns:
+        A single string made of the remaining tokens joined by one space.
+    """
+    # Lowercase first, then delete every character in string.punctuation.
+    punctuation_table = str.maketrans('', '', string.punctuation)
+    cleaned = text.lower().translate(punctuation_table)
+
+    # Split into word tokens with NLTK's tokenizer.
+    tokens = nltk.word_tokenize(cleaned)
+
+    # Drop every token that appears in NLTK's English stopword list.
+    english_stopwords = set(nltk.corpus.stopwords.words('english'))
+    kept_tokens = [token for token in tokens if token not in english_stopwords]
+
+    return ' '.join(kept_tokens)
+
+
+# Function to extract text from PDF
+def extract_pdf_text(file_path):
+    """Return the text of every page of the PDF at ``file_path``.
+
+    Args:
+        file_path: Path to the PDF file; it is opened in binary mode.
+
+    Returns:
+        The extracted text of all pages, in page order, separated by
+        newlines. A page that yields no text contributes an empty string.
+    """
+    with open(file_path, 'rb') as file:
+        reader = PyPDF2.PdfReader(file)
+        # Join pages with a newline so the last word of one page is not fused
+        # with the first word of the next, which would corrupt tokenization
+        # downstream. `or ''` guards against an extractor returning None for
+        # a page. The join runs inside the `with` because pages are read
+        # from the still-open file.
+        return '\n'.join(page.extract_text() or '' for page in reader.pages)
+
+
+# Function to get BERT embeddings for text
+def get_bert_embeddings(text):
+    """Embed ``text`` as a single vector using the module-level BERT model.
+
+    The text is first cleaned with ``preprocess_text`` and then tokenized
+    with truncation at 512 tokens, so anything beyond that limit is ignored.
+
+    Args:
+        text: The raw input string.
+
+    Returns:
+        A torch tensor: the model's ``last_hidden_state`` averaged over
+        dimension 1 and squeezed.
+    """
+    cleaned = preprocess_text(text)
+
+    # Build the model inputs as PyTorch tensors.
+    encoded = tokenizer(
+        cleaned,
+        return_tensors='pt',
+        truncation=True,
+        padding=True,
+        max_length=512,
+    )
+
+    # Inference only, so gradient tracking is switched off.
+    with torch.no_grad():
+        hidden_states = model(**encoded).last_hidden_state
+        pooled = hidden_states.mean(dim=1).squeeze()
+
+    return pooled
+
+
+# Function to calculate similarity between CV and JD
+def match_cv_and_jd(cv_text, jd_text):
+    """Return the cosine similarity between a CV and a job description.
+
+    Args:
+        cv_text: Raw text of the CV.
+        jd_text: Raw text of the job description.
+
+    Returns:
+        The single entry of the 1x1 matrix produced by ``cosine_similarity``
+        for the two BERT embeddings.
+    """
+    # cosine_similarity expects 2-D inputs, so each embedding gets a leading
+    # batch dimension of size one.
+    cv_vector = get_bert_embeddings(cv_text).unsqueeze(0)
+    jd_vector = get_bert_embeddings(jd_text).unsqueeze(0)
+
+    score_matrix = cosine_similarity(cv_vector, jd_vector)
+
+    return score_matrix[0][0]
+
+
+# Script entry point: score one CV PDF against one job-description PDF
+if __name__ == "__main__":
+    # Both inputs are read from PDF files in the current working directory.
+    cv_text = extract_pdf_text('DevendraChaturvediAI_ML.pdf')  # Replace with actual CV PDF file
+    jd_text = extract_pdf_text('jd.pdf')  # Replace with actual JD PDF file
+
+    similarity_score = match_cv_and_jd(cv_text, jd_text)
+    print(f"Similarity Score: {similarity_score:.4f}")
+
+    # Scores strictly above the threshold count as a match
+    # (e.g., 0.7 means 70% similarity).
+    threshold = 0.7
+    verdict = (
+        "The CV is a good match for the job description."
+        if similarity_score > threshold
+        else "The CV is not a good match for the job description."
+    )
+    print(verdict)
+
--- a/src/requirements.text
+++ b/src/requirements.text
+nltk
+PyPDF2
+scikit-learn
+transformers
+torch
\ No newline at end of file