Let's compare three ways to count lines: shelling out to wc, counting newlines in big chunks, and iterating over the file:
from subprocess import check_output

def wc(filename):
    # Shell out to wc -l and parse the count from the first output field
    return int(check_output(["wc", "-l", filename]).split()[0])

def native(filename):
    # Read 10 MB chunks and count the newlines in each
    c = 0
    with open(filename) as file:
        while True:
            chunk = file.read(10 ** 7)
            if chunk == "":
                return c
            c += chunk.count("\n")

def iterate(filename):
    # Let the file iterator do the line splitting; enumerate tracks the count
    with open(filename) as file:
        for i, line in enumerate(file):
            pass
        return i + 1
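Before timing, a quick sanity check that all three agree ("bigfile" is just a placeholder name; any text file works):

# All three implementations should report the same line count
assert wc("bigfile") == native("bigfile") == iterate("bigfile")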
Go go timeit function!
from timeit import timeit
from sys import argv

filename = argv[1]

def testwc():
    wc(filename)

def testnative():
    native(filename)

def testiterate():
    iterate(filename)

print("wc", timeit(testwc, number=10))
print("native", timeit(testnative, number=10))
print("iterate", timeit(testiterate, number=10))
Result:
wc 1.25185894966
native 2.47028398514
iterate 2.40715694427
So, wc is about twice as fast on a 150 MB compressed file with ~500,000 line breaks, which is what I tested on first. However, testing on a file generated with seq 3000000 > bigfile (three million short lines), I get these numbers:
wc 0.425990104675
native 0.400163888931
iterate 3.10369205475
Hey look, Python FTW! With millions of tiny lines, iterate loses because the per-line loop overhead dominates. However, using longer lines (~70 chars):
wc 1.60881590843
native 3.24313092232
iterate 4.92839002609
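A test file of that shape can be generated with something like this (the filler content is a stand-in; the exact file isn't shown above):

# Write ~3 million lines of ~70 characters each as a long-line test file
with open("bigfile_long", "w") as f:
    for _ in range(3 * 10 ** 6):
        f.write("x" * 69 + "\n")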
So, conclusion: it depends, but wc seems to be the best bet all-round.
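One tweak that might narrow the gap for native on Python 3 (untested here, so treat it as a sketch): open the file in binary mode, which skips text decoding and newline translation:

def native_binary(filename):
    # Same chunked count as native(), but on raw bytes: no decoding overhead
    c = 0
    with open(filename, "rb") as file:
        while True:
            chunk = file.read(10 ** 7)
            if not chunk:  # b"" at EOF
                return c
            c += chunk.count(b"\n")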