-
Notifications
You must be signed in to change notification settings - Fork 0
/
text2wordcounts.py
executable file
·41 lines (33 loc) · 1.2 KB
/
text2wordcounts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 30 09:19:05 2014
@author: mwilkens
Input: One fulltext file, named as first commandline argument.
Output: CSV list of wordcounts, formatted like JSTOR DfR, to stdout.
Usage: text2wordcounts <input_file.txt>
"""
import sys, string
# Translation table that maps every ASCII punctuation character to a space.
# Python 3 exposes the table builder as str.maketrans; Python 2 only has
# string.maketrans, so fall back to it when the str attribute is missing.
try:
    _make_table = str.maketrans
except AttributeError:
    _make_table = string.maketrans
replace_punctuation = _make_table(string.punctuation,
                                  ' ' * len(string.punctuation))


def count_words(lines):
    """Count occurrences of each unique word in an iterable of text lines.

    Each line is lowercased, ASCII punctuation is replaced by spaces, and the
    result is split on whitespace.  Because punctuation becomes a space, a
    token such as "cat's" is counted as the two words "cat" and "s".

    Args:
        lines: Iterable of strings (an open text file works).

    Returns:
        A dict mapping each word to the number of times it occurred.
    """
    words = {}
    for line in lines:
        cleaned = line.lower().translate(replace_punctuation)
        # split() with no separator never yields empty or padded tokens,
        # so no further stripping or filtering is required.
        for word in cleaned.split():
            words[word] = words.get(word, 0) + 1
    return words


def format_wordcounts(words):
    """Render word counts as DfR-style CSV rows.

    Args:
        words: Dict mapping word -> count, as returned by count_words().

    Returns:
        A list of strings: the "WORDCOUNTS,WEIGHT" header followed by one
        "word,count" row per word, most frequent first.  Words with equal
        counts are ordered alphabetically so the output is deterministic.
    """
    rows = ["WORDCOUNTS,WEIGHT"]  # DfR-style header
    ranked = sorted(words.items(), key=lambda item: (-item[1], item[0]))
    rows.extend('%s,%d' % (word, count) for word, count in ranked)
    return rows


def main(args):
    """Run the script: count words in the file named by args[1].

    Args:
        args: Command-line argument list in sys.argv form (program name
            first, input file path second).

    Returns:
        Process exit status: 0 on success, 2 when the input file argument
        is missing (a usage message is written to stderr in that case).
    """
    if len(args) < 2:
        sys.stderr.write("Usage: text2wordcounts <input_file.txt>\n")
        return 2
    # Count words one line at a time so the whole file is never held in memory.
    with open(args[1], 'r') as text:
        words = count_words(text)
    sys.stdout.write('\n'.join(format_wordcounts(words)) + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))