Commit 7d91b281 authored by LSING46

added python code and requirements

parent 1c607c4f
import nltk
import PyPDF2
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
import string
# Download the NLTK resources this script needs (you can comment these out if
# they are already installed locally).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' tables
# instead of the pickled 'punkt' models; without this download tokenization
# fails with LookupError there. NOTE(review): on older NLTK versions that do
# not know this id, download() is expected to report an error and return
# False rather than raise -- confirm against the pinned NLTK version.
nltk.download('punkt_tab')
# Initialize the BERT tokenizer and model once at import time so every
# embedding call reuses the same loaded weights.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Normalise raw text before it is handed to the embedding model.
def preprocess_text(text):
    """Return *text* lower-cased, stripped of punctuation and stop words.

    Steps, in order:
      1. lower-case the input;
      2. delete every character listed in ``string.punctuation``;
      3. split into tokens with ``nltk.word_tokenize``;
      4. discard tokens found in NLTK's English stop-word list;
      5. join the surviving tokens with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)

    tokens = nltk.word_tokenize(normalized)

    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)
# Function to extract text from PDF
def extract_pdf_text(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        A single string containing each page's extracted text, with no
        separator inserted between pages. Pages that yield no text
        contribute nothing.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open. `or ''` is a
        # defensive guard: a falsy result (e.g. None for an image-only page)
        # is treated as empty instead of breaking the concatenation.
        return ''.join(page.extract_text() or '' for page in reader.pages)
# Turn a piece of text into a single BERT embedding vector.
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for *text*.

    The text is first cleaned with ``preprocess_text``, then encoded with the
    module-level ``tokenizer`` and run through the module-level ``model``
    with gradient tracking disabled. The model's ``last_hidden_state`` is
    averaged over dim 1 (the token axis of ``BertModel`` output) and squeezed.

    Note: the tokenizer is called with ``truncation=True, max_length=512``,
    so anything beyond the first 512 tokens of the cleaned text is dropped
    and does not influence the embedding.

    Args:
        text: Raw input text.

    Returns:
        A torch tensor holding the pooled embedding.
    """
    cleaned = preprocess_text(text)

    encoded = tokenizer(
        cleaned,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze()
# Score how closely a CV matches a job description.
def match_cv_and_jd(cv_text, jd_text):
    """Return the cosine similarity between the CV and JD embeddings.

    Both texts are embedded with ``get_bert_embeddings``; each embedding is
    given a leading dimension of size 1 so the pair can be passed to
    ``cosine_similarity``, and the single entry of the result is returned.

    Args:
        cv_text: Text of the candidate's CV.
        jd_text: Text of the job description.

    Returns:
        The similarity score as a scalar (element ``[0][0]`` of the
        similarity matrix).
    """
    cv_vector = get_bert_embeddings(cv_text)
    jd_vector = get_bert_embeddings(jd_text)

    cv_row = cv_vector.unsqueeze(0)
    jd_row = jd_vector.unsqueeze(0)

    score_matrix = cosine_similarity(cv_row, jd_row)
    return score_matrix[0][0]
# Script entry point: compare one CV PDF against one job-description PDF.
if __name__ == "__main__":
    # Scores strictly above this value are reported as a good match
    # (e.g., 0.7 means 70% similarity).
    threshold = 0.7

    # Load the CV and JD text from their PDF files.
    cv_text = extract_pdf_text('DevendraChaturvediAI_ML.pdf')  # Replace with actual CV PDF file
    jd_text = extract_pdf_text('jd.pdf')  # Replace with actual JD PDF file

    # Compute and report the similarity score.
    similarity_score = match_cv_and_jd(cv_text, jd_text)
    print(f"Similarity Score: {similarity_score:.4f}")

    # Pick the verdict message based on the threshold.
    verdict = (
        "The CV is a good match for the job description."
        if similarity_score > threshold
        else "The CV is not a good match for the job description."
    )
    print(verdict)
nltk
PyPDF2
scikit-learn
transformers
torch
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment