pdf code

45017fdf · Prayas Jain · 79fb0e78 · 45017fdf · 45017fdf · 45017fdf
Commit 45017fdf authored Jan 18, 2025 by Prayas Jain
6 changed files
--- a/src/main.py
+++ b/src/main.py
 from fastapi import FastAPI
-from src.api import router
+from api import router

 app = FastAPI()


--- a/src/pdf.py
+++ b/src/pdf.py
+import os
+import pdfplumber
+import re
+import pandas as pd
+
+# Directory scanned for CV PDFs. A relative path is resolved against the
+# current working directory, so either run the script from the folder that
+# contains "pdfs" or replace this with an absolute path.
+pdf_folder_path = "./pdfs"  # Example: './cv_pdfs' or use absolute path like 'C:/Users/John/cv_pdfs'
+
+# Names of the PDF files found directly inside pdf_folder_path.
+# The extension check is case-insensitive so files such as "CV.PDF" are not
+# silently skipped, and the names are sorted because os.listdir() returns
+# entries in arbitrary order (which would make the output order unstable).
+pdf_files = sorted(
+    f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf')
+)
+
+# Collected records; one entry is appended per PDF that yields any text.
+extracted_data = []
+
+# Normalize the whitespace of a chunk of extracted text
+def clean_text(text):
+    """Return *text* trimmed, with each whitespace run collapsed to one space."""
+    trimmed = text.strip()
+    return re.sub(r'\s+', ' ', trimmed)
+
+
+# Function to extract the first email address found in a piece of text
+def extract_email(text):
+    """Return the first email address found in *text*, or None.
+
+    The domain is matched as dot-separated labels, so punctuation that merely
+    follows the address (e.g. the full stop in "mail me at a@b.com.") is not
+    captured as part of it. Empty or missing text yields None instead of
+    raising.
+    """
+    if not text:
+        return None
+    email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'
+    match = re.search(email_regex, text)
+    if match is None:
+        return None
+    # The label pattern allows hyphens anywhere, so a dash that directly
+    # follows the address in the surrounding text would be captured; drop it.
+    return match.group(0).rstrip('-')
+
+
+# Extract the text and the first email address from every PDF in the folder
+for idx, pdf_file in enumerate(pdf_files, start=1):
+    pdf_path = os.path.join(pdf_folder_path, pdf_file)  # Join folder path and filename
+
+    # Collect the cleaned text of each page; the PDF is closed as soon as
+    # the pages have been read.
+    page_texts = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            text = page.extract_text()
+            if not text:  # Page produced no text; skip it
+                continue
+            cleaned_text = clean_text(text)
+            if cleaned_text:  # Whitespace-only pages clean down to ""
+                page_texts.append(cleaned_text)
+
+    # Join pages with a single space so the last word of one page and the
+    # first word of the next do not fuse together (fused text could also
+    # corrupt the email match when an address sits at a page boundary).
+    full_text = " ".join(page_texts)
+    if not full_text:
+        continue  # Only store a record if some text was extracted
+
+    # Store the record under fixed keys so every entry has the same shape.
+    # Using the email value itself as the key would give each record a
+    # different (possibly None) key and one CSV column per address.
+    # 'id' is the 1-based position of the file in pdf_files.
+    extracted_data.append({
+        "id": idx,
+        "email": extract_email(full_text),
+        "text": full_text,
+    })
+
+# At this point extracted_data holds one dict per CV that yielded any text.
+
+# Show the collected records on stdout
+print(extracted_data)
+
+# Persist the records as pretty-printed JSON
+import json
+with open("extracted_cvs.json", "w") as out_file:
+    out_file.write(json.dumps(extracted_data, indent=4))
+
+# Persist the same records as CSV, without the DataFrame index column
+df = pd.DataFrame(extracted_data)
+df.to_csv("extracted_cvs.csv", index=False)
--- a/src/pdfs/MUSKAN_JAIN_UPDATED_CV (3).pdf
+++ b/src/pdfs/MUSKAN_JAIN_UPDATED_CV (3).pdf
--- a/src/pdfs/PRAYAS_JAIN_UI_CV.pdf
+++ b/src/pdfs/PRAYAS_JAIN_UI_CV.pdf
--- a/src/pdfs/resume.pdf
+++ b/src/pdfs/resume.pdf
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -6,4 +6,6 @@ streamlit
 fastapi
 uvicorn
 requests
-router
\ No newline at end of file
+PyPDF2
+pdfplumber
+pandas
\ No newline at end of file