Commit 45017fdf authored by Prayas Jain

pdf code

parent 79fb0e78
from fastapi import FastAPI
from src.api import router
from api import router
# FastAPI application object, constructed with default settings. Attaching the
# imported `router` is not visible in this excerpt.
app = FastAPI()
......
import os
import pdfplumber
import re
import pandas as pd
# Directory containing the CV PDFs, relative to the current working directory.
# Replace with your actual path, e.g. './cv_pdfs' or 'C:/Users/John/cv_pdfs'.
pdf_folder_path = "./pdfs"

# Names of the PDF files in that folder. The extension check is
# case-insensitive so files such as 'CV.PDF' are not skipped, and the names are
# sorted because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(
    f for f in os.listdir(pdf_folder_path) if f.lower().endswith(".pdf")
)

# Accumulates one record per PDF from which text could be extracted.
extracted_data = []
# Function to normalise whitespace in extracted text
def clean_text(text):
    """Return *text* with surrounding whitespace removed and inner runs collapsed.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space.
    No other characters are altered.

    Args:
        text: The string to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string, or ``""`` when *text* is ``None`` or empty.
    """
    if not text:
        return ""
    # split() with no separator drops leading/trailing whitespace and treats
    # each whitespace run as one delimiter, so a join rebuilds the text with
    # single spaces.
    return " ".join(text.split())
# Function to extract the first email address from text
# The pattern is compiled once at import time. Its last character must be
# alphanumeric so that sentence punctuation directly after an address
# ("... me@site.com.") is not captured as part of it.
_EMAIL_PATTERN = re.compile(
    r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]*[a-zA-Z0-9]"
)


def extract_email(text):
    """Return the first email address found in *text*.

    Args:
        text: The string to search. ``None`` or an empty string is accepted.

    Returns:
        The first matching address as a string, or ``None`` when *text* is
        empty or contains no address.
    """
    if not text:
        return None
    match = _EMAIL_PATTERN.search(text)
    return match.group(0) if match else None
# Iterate through each PDF file in the folder and build one record per file
for idx, pdf_file in enumerate(pdf_files):
    pdf_path = os.path.join(pdf_folder_path, pdf_file)  # Join folder path and filename

    # Collect the cleaned text of every page that yields any text
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:  # Skip pages with no extractable text
                continue
            cleaned_text = clean_text(text)
            if cleaned_text:
                page_texts.append(cleaned_text)

    # Join pages with a space so the last word of one page and the first word
    # of the next are not fused together (which could also corrupt an email
    # address sitting at a page boundary).
    full_text = " ".join(page_texts)
    if not full_text:  # Only store if text was extracted
        continue

    email = extract_email(full_text)

    # Store the record under fixed keys so that every entry has the same shape
    # and the CSV export gets one column per field rather than one per CV.
    extracted_data.append({
        "id": idx + 1,      # 1-based position of the file in pdf_files
        "email": email,     # First email address found, or None
        "text": full_text,  # Full extracted text from the CV
    })

# Now extracted_data contains a list of objects with 'id', 'email', and 'text'.
# Name and phone number are not extracted by this script.
# Example of the result:
# extracted_data = [
#     {"id": 1, "email": "john.doe@example.com", "text": "Full text of CV 1..."},
#     {"id": 2, "email": "jane.smith@example.com", "text": "Full text of CV 2..."},
#     {"id": 3, "email": None, "text": "Full text of CV 3..."}
# ]
# Show the collected records on stdout
print(extracted_data)

# Persist the records as JSON, pretty-printed with a 4-space indent
import json

with open("extracted_cvs.json", "w") as json_file:
    json_file.write(json.dumps(extracted_data, indent=4))

# Persist the same records as CSV, omitting the DataFrame index column
df = pd.DataFrame(extracted_data)
df.to_csv("extracted_cvs.csv", index=False)
......@@ -6,4 +6,6 @@ streamlit
fastapi
uvicorn
requests
router
\ No newline at end of file
PyPDF2
pdfplumber
pandas
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment