Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
cv_filter_tool
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Lokesh Singh
cv_filter_tool
Commits
45017fdf
Commit
45017fdf
authored
Jan 18, 2025
by
Prayas Jain
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
pdf code
parent
79fb0e78
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
77 additions
and
2 deletions
+77
-2
main.py
src/main.py
+1
-1
pdf.py
src/pdf.py
+73
-0
MUSKAN_JAIN_UPDATED_CV (3).pdf
src/pdfs/MUSKAN_JAIN_UPDATED_CV (3).pdf
+0
-0
PRAYAS_JAIN_UI_CV.pdf
src/pdfs/PRAYAS_JAIN_UI_CV.pdf
+0
-0
resume.pdf
src/pdfs/resume.pdf
+0
-0
requirements.txt
src/requirements.txt
+3
-1
No files found.
src/main.py
View file @
45017fdf
from
fastapi
import
FastAPI
from
src.
api
import
router
from
api
import
router
app
=
FastAPI
()
...
...
src/pdf.py
0 → 100644
View file @
45017fdf
import
os
import
pdfplumber
import
re
import
pandas
as
pd
# Path to the directory containing your PDF files (replace with your actual path)
pdf_folder_path = "./pdfs"  # Example: './cv_pdfs' or use absolute path like 'C:/Users/John/cv_pdfs'

# List all PDF files in the folder.
# The extension check is case-insensitive so files named '*.PDF' are not
# skipped, and the names are sorted because os.listdir() returns entries in
# arbitrary order (sorting keeps the output order stable between runs).
pdf_files = sorted(
    f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf')
)

# Initialize an empty list to store the extracted data
extracted_data = []
# Function to clean the text (remove unnecessary spaces, special characters, etc.)
def clean_text(text):
    """Normalize whitespace in a piece of extracted text.

    Leading and trailing whitespace is removed and every run of whitespace
    (spaces, tabs, newlines) is collapsed into a single space.

    Args:
        text: The raw text to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is ``None`` or empty.
    """
    # Guard against None/empty input instead of failing on text.strip().
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text.strip())
# Function to extract email address from text
# The address must end in an alphanumeric character, so punctuation that
# merely follows the address in running text ("... john@example.com.") is not
# captured as part of it.
_EMAIL_RE = re.compile(
    r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]*[a-zA-Z0-9]'
)


def extract_email(text):
    """Return the first email address found in ``text``.

    Args:
        text: The text to search. ``None`` or an empty string is accepted.

    Returns:
        The first matching address as a string, or ``None`` when ``text`` is
        empty or contains no address.
    """
    if not text:
        return None
    match = _EMAIL_RE.search(text)
    return match.group(0) if match else None
# Iterate through each PDF file in the folder
for idx, pdf_file in enumerate(pdf_files, start=1):
    pdf_path = os.path.join(pdf_folder_path, pdf_file)  # Join folder path and filename

    # Open the PDF file and collect the cleaned text of every page
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # Skip pages where no text could be extracted
                cleaned_text = clean_text(text)
                if cleaned_text:  # Skip pages that were whitespace only
                    page_texts.append(cleaned_text)

    # Join pages with a space so the last word of one page does not fuse with
    # the first word of the next (which could also corrupt the email match).
    full_text = " ".join(page_texts)
    email = extract_email(full_text)

    # Store one record per CV with fixed 'id', 'email' and 'text' keys.
    # Fixed keys keep the JSON/CSV schema stable; keying the record by the
    # email itself would give a None key when no address is found and one
    # CSV column per address.
    if full_text:  # Only store if text was extracted
        extracted_data.append({
            "id": idx,  # 1-based position of the file in pdf_files
            "email": email,  # None when no address was found
            "text": full_text,  # Full extracted text from the CV
        })

# Now extracted_data contains a list of objects with 'id', 'email', and 'text'
# ('name' and 'phone' are not extracted by this script).
# Example of the result:
# extracted_data = [
#     {"id": 1, "email": "john.doe@example.com", "text": "Full text of CV 1..."},
#     {"id": 2, "email": "jane.smith@example.com", "text": "Full text of CV 2..."},
#     {"id": 3, "email": "alice.johnson@example.com", "text": "Full text of CV 3..."}
# ]
# Print the resulting dataset
# Show the collected records on stdout
print(extracted_data)

# Optionally: persist the records to disk, first as indented JSON ...
import json

with open("extracted_cvs.json", "w") as json_file:
    json.dump(
        extracted_data,
        json_file,
        indent=4,
    )

# ... and then as CSV via a DataFrame, leaving out the index column
df = pd.DataFrame(extracted_data)
df.to_csv(
    "extracted_cvs.csv",
    index=False,
)
src/pdfs/MUSKAN_JAIN_UPDATED_CV (3).pdf
0 → 100644
View file @
45017fdf
File added
src/pdfs/PRAYAS_JAIN_UI_CV.pdf
0 → 100644
View file @
45017fdf
File added
src/pdfs/resume.pdf
0 → 100644
View file @
45017fdf
File added
src/requirements.txt
View file @
45017fdf
...
...
@@ -6,4 +6,6 @@ streamlit
fastapi
uvicorn
requests
router
\ No newline at end of file
PyPDF2
pdfplumber
pandas
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment