Skip to content

Commit b57588b

Browse files
committed
First Commit
1 parent 790e4e2 commit b57588b

40 files changed

+3068
-0
lines changed

.idea/Python-OCR-Django.iml

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/vcs.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

OCR_Django_App/__init__.py

Whitespace-only changes.

OCR_Django_App/settings.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
"""
2+
Django settings for OCR_Django_App project.
3+
4+
Generated by 'django-admin startproject' using Django 1.11.20.
5+
6+
For more information on this file, see
7+
https://docs.djangoproject.com/en/1.11/topics/settings/
8+
9+
For the full list of settings and their values, see
10+
https://docs.djangoproject.com/en/1.11/ref/settings/
11+
"""
12+
13+
import os
14+
15+
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# Deployments should provide DJANGO_SECRET_KEY; the literal below is only the
# development fallback that keeps existing local setups working unchanged.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    '=!3*5*bo4t)6=7el(1w1n^x6$9fd-03&k&q1w_14y$fsx(nsb)',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Defaults to True (the previous hard-coded value). Set DJANGO_DEBUG to
# anything other than 1/true/yes/on (case-insensitive) to turn it off.
DEBUG = os.environ.get('DJANGO_DEBUG', 'True').strip().lower() in (
    '1', 'true', 'yes', 'on',
)

# Comma-separated host names read from DJANGO_ALLOWED_HOSTS.
# Empty by default, exactly like the previous hard-coded [].
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]

# Extra static directory: the "static" folder next to this settings module.
STATICFILES_DIRS = (
    os.path.join(os.path.dirname(__file__), 'static').replace('\\','/'),
)

# Application definition

INSTALLED_APPS = [
    'index.apps.IndexConfig',
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'OCR_Django_App.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'OCR_Django_App.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.11/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}


# Password validation
# https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/1.11/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.11/howto/static-files/

STATIC_URL = '/static/'

OCR_Django_App/urls.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from django.contrib import admin
2+
#from django.urls import path
3+
from django.conf.urls import url, include
4+
5+
# Project URL table. The regexes are anchored with ``^`` so each prefix only
# matches at the start of the request path; without the anchor r'admin/'
# would also match a path such as "foo/admin/".
urlpatterns = [
    url(r'^admin/', admin.site.urls),
    # Everything under "index/" is delegated to the index app's URLconf.
    url(r'^index/', include('index.urls')),
]

OCR_Django_App/wsgi.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""
2+
WSGI config for OCR_Django_App project.
3+
4+
It exposes the WSGI callable as a module-level variable named ``application``.
5+
6+
For more information on this file, see
7+
https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/
8+
"""
9+
10+
import os
11+
12+
from django.core.wsgi import get_wsgi_application
13+
14+
# Use this project's settings module unless the environment already names one.
os.environ.setdefault(
    "DJANGO_SETTINGS_MODULE",
    "OCR_Django_App.settings",
)

# Module-level WSGI callable exposed by this file.
application = get_wsgi_application()

__init__.py

Whitespace-only changes.

db.sqlite3

3 KB
Binary file not shown.

index/__init__.py

Whitespace-only changes.

index/admin.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import unicode_literals
3+
4+
from django.contrib import admin
5+
6+
# Register your models here.

index/apps.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import unicode_literals
3+
4+
from django.apps import AppConfig
5+
6+
7+
class IndexConfig(AppConfig):
    """Application configuration for the ``index`` app."""

    # Name of the application this configuration applies to.
    name = 'index'

index/migrations/__init__.py

Whitespace-only changes.

index/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
2+
from django.db import models
3+
4+
# Create your models here.
5+
6+
class FileName(models.Model):
    """Model holding a file's name and its extension as two text columns."""

    # File name, limited to 500 characters.
    file_name = models.CharField(max_length=500)

    # File extension, limited to 100 characters.
    file_extension = models.CharField(max_length=100)

index/source/__init__.py

Whitespace-only changes.

index/source/utils.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import os
2+
3+
from pdf2image import convert_from_path
4+
import re
5+
import numpy as np
6+
7+
# For importing PyTesseract
8+
try:
9+
from PIL import Image
10+
except ImportError:
11+
import Image
12+
import pytesseract
13+
14+
# Location of the Tesseract executable on a default Windows install.
# It is only applied when that file actually exists; otherwise pytesseract
# keeps its own default command (looked up through PATH), so the module also
# works on machines where Tesseract is installed somewhere else.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
16+
17+
'''
18+
Function for merging PNG's
19+
'''
20+
def merge_pdf(folder_path):
    """Stack every PNG found under *folder_path* into a single tall PNG.

    All page images are resized to the size of the smallest one (smallest
    width + height), stacked vertically and saved inside *folder_path* as
    ``<folder name without its extension>.png``. The individual page PNGs
    are deleted afterwards.

    Returns None. Nothing is written or deleted when no PNG is found.
    """
    page_paths = []
    for root, _dirs, files in os.walk(folder_path):
        for single_file in files:
            if str(single_file).endswith(".png"):
                # Join with the directory the file was found in so PNGs in
                # sub-folders resolve correctly, using the OS path separator.
                page_paths.append(os.path.join(root, single_file))

    if not page_paths:
        # Nothing to merge; avoids indexing into an empty list below.
        return

    # The merged PNG is named after the folder, minus its extension
    # (e.g. a folder called "report.pdf" yields "report.png").
    folder_name = str(os.path.basename(folder_path))
    file_name = os.path.splitext(folder_name)[0]

    imgs = [Image.open(page_path) for page_path in page_paths]
    min_shape = min((np.sum(img.size), img.size) for img in imgs)[1]

    # Vertical stacking. np.vstack expects a real sequence; passing a
    # generator has been deprecated since NumPy 1.16.
    stacked = np.vstack([np.asarray(img.resize(min_shape)) for img in imgs])
    merged = Image.fromarray(stacked)
    merged.save(os.path.join(folder_path, file_name + '.png'), 'PNG')

    # Release the source images before deleting their files.
    for img in imgs:
        img.close()

    # Remove the per-page PNGs now that they are merged.
    for image in page_paths:
        os.remove(image)
        print("File: " + str(image) + " removed.")
45+
46+
'''
47+
Function for processing OCR of Single Image File
48+
'''
49+
# def multi_file_tesseract(path, folder_path, single_image):
50+
def multi_file_tesseract(path, folder_path):
    """OCR every ``.png`` found under *path* and append the text to TXT files.

    For each PNG the recognised text is appended to
    ``<folder_path>/<name>.txt``, where ``<name>`` is the part of the PNG's
    file name that precedes ``".pdf"`` (the whole file name when ".pdf" does
    not occur). Pages that share that prefix therefore end up in one file.

    Every file name visited is printed. Returns None.
    """
    for root, _dirs, files in os.walk(path):
        for single_file in files:
            print(single_file)
            if not str(single_file).endswith(".png"):
                continue

            file_path = os.path.join(root, single_file)
            print("this: " + file_path)

            final_name = single_file.split(".pdf")[0]

            # Simple image to string; the image is closed once OCR is done.
            with Image.open(file_path) as image:
                ocr_text = pytesseract.image_to_string(image)

            # Append mode: repeated runs and multiple pages accumulate text.
            txt_path = os.path.join(folder_path, str(final_name) + ".txt")
            with open(txt_path, "a") as text_file:
                text_file.write(str(ocr_text))
71+
72+
73+
'''
74+
For OCR'ing a single file and saving it in same directory
75+
'''
76+
def single_file_tesseract(path):
    """OCR one image file and append the result to ``<path>.txt``.

    A path ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) is
    opened and passed to pytesseract. For any other path the text
    "File not supported." is used instead. Either way the text is appended
    to ``<path>.txt`` and returned.
    """
    print(path)

    image_formats = ('.png', '.jpg', '.jpeg')

    # One suffix test for all formats. Checking the formats one at a time and
    # assigning on every iteration lets a later non-matching format overwrite
    # the OCR result of an earlier matching one.
    if str(path).lower().endswith(image_formats):
        # Simple image to string
        with Image.open(path) as image:
            ocr_text = pytesseract.image_to_string(image)
    else:
        ocr_text = "File not supported."

    # Creating TXT Files
    with open(path + ".txt", "a") as text_file:
        text_file.write(str(ocr_text))

    return ocr_text
94+
95+
'''
96+
Function for converting PDF to image
97+
'''
98+
def pdf_to_image(file_path):
    """Convert each page of the PDF at *file_path* to a PNG file.

    The pages come from ``convert_from_path(file_path, 300)``. They are saved
    as ``<pdf file name>-page<N>.png`` (N starting at 0) inside
    ``<pdf directory>/PNG/<pdf file name>/``, which is created when missing.

    Returns the path of that output folder.
    """
    pages = convert_from_path(file_path, 300)

    # Split the path with os.path instead of text replacement, so a directory
    # whose name contains the file's stem is left intact.
    pdf_dir, file_name = os.path.split(str(file_path))

    # New path for placing PNG's.
    create_folder = os.path.join(pdf_dir, "PNG", file_name)
    print("This is the path that I need to pick up: " + create_folder)
    if not os.path.exists(create_folder):
        os.makedirs(create_folder)
    else:
        print("Folder Already Exists.")

    # To save PNG's in same folder as name.
    new_png_path = os.path.join(create_folder, file_name)
    # enumerate gives each page its own position; looking the page up with
    # list.index() returns the first *equal* page, so identical pages would
    # be written to the same file.
    for page_number, page in enumerate(pages):
        page.save("%s-page%d.png" % (new_png_path, page_number), "PNG")

    return create_folder

0 commit comments

Comments
 (0)