#!/usr/bin/perl -w

=head1 NAME

find-text-files - traverse a file tree and guess plain text files

=head1 SYNOPSIS

find-text-files [options] dir ...

=head1 DESCRIPTION

This program traverses a file tree, guesses which files are plain text
and prints their names to STDOUT, one per line.

Each non-empty regular file is classified by the first rule that applies:

=over 4

=item 1. the exclude regular expression (matched against the file name)

=item 2. the include regular expression (matched against the file name)

=item 3. the built-in list of binary file suffixes

=item 4. the built-in list of text file suffixes

=item 5. the description printed by file(1)

=back

=head1 OPTIONS

=over 4

=item --help

Print a usage message and exit.

=item --exclude=RE, --include=RE

Add a regular expression to the exclude/include set. May be repeated.
Matching is done against the file name without its directory and is
case-insensitive.

=item --total

Print a table of counters to STDERR when done.

=item --excluded, --included

Report every excluded/included file and the deciding rule to STDERR.

=item --selectors

Print the effective regular expressions and suffix lists to STDERR and exit.

=back

=cut

require 5.004;
use strict;
use integer;

use File::Find;
use Getopt::Long;
use IPC::Open2;


# Print an optional error message and the usage text to STDERR, then exit
# with a failure status.  Never returns.
#
# NOTE(review): the original usage text was lost from the reviewed copy of
# this file; the text below is reconstructed from the option table passed to
# GetOptions() - confirm the wording against the upstream script.
sub usage
{
    warn "\n".join(" ", @_)."\n" if @_;
    warn <<'EOF';

Usage: find-text-files [options] dir ...

Options:
    --help          print this message and exit
    --exclude=RE    exclude files whose name matches RE (may be repeated)
    --include=RE    include files whose name matches RE (may be repeated)
    --total         print total counters to STDERR when done
    --excluded      report excluded files to STDERR
    --included      report included files to STDERR
    --selectors     print selection REs and suffix lists to STDERR and exit

EOF
    exit 1;
}


my $help_option;
my @exclude_options;
my @include_options;
my $total_option;
my $excluded_option;
my $included_option;
my $selectors_option;

GetOptions(
    'help'      => \$help_option,
    'exclude=s' => \@exclude_options,
    'include=s' => \@include_options,
    'total'     => \$total_option,
    'excluded'  => \$excluded_option,
    'included'  => \$included_option,
    'selectors' => \$selectors_option,
) or usage;

usage if $help_option;


# Lower-case suffixes (without the dot) that mark a file as binary.
# diff/patch files are listed here because they could have EOL spaces!
my %bin_suffices = map { ($_ => undef) } (
    'gif', 'tif', 'tiff', 'png', 'jpg', 'jpeg',
    'avi', 'mpg', 'mpeg',
    'o', 'obj', 'exe', 'cab', 'a',
    'rar', 'arj', 'zip', 'tar', 'cpio',
    'z', 'gz', 'bz', 'bz2', 'tgz', 'tbz', 'tbz2',
    'iso', 'bin', 'img', 'imag', 'image',
    'diff', 'patch'
);

# Lower-case suffixes (without the dot) that mark a file as text.
my %txt_suffices = map { ($_ => undef) } (
    'txt', 'text',
    'html', 'htm', 'xml', 'php',
    'c', 'cpp', 'c++', 'cc', 'cxx',
    'h', 'hpp', 'h++', 'hh', 'hxx',
    'asm', 'inc', 'mod',
    'for', 'f77', 'g77',
    'java', 'jav',
    'bas', 'vb',
    'pl', 'pm', 'pod',
    'make', 'mak', 'mk',
    'awk', 'sh',
    'bat', 'cmd', 'rexx', 'rex',
    'sql', 'def', 'man',
    'cvsignore'
);


# Both REs are matched case-insensitively against the lower-cased file name.
# User patterns are NOT lower-cased themselves: lc() on regex source would
# silently turn escapes such as \S, \W or \D into \s, \w and \d.
my $exclude_re = '(,v$)';
foreach my $re (@exclude_options)
{
    $exclude_re .= '|('.$re.')';
}

my $include_re = '(^makefile$)';
foreach my $re (@include_options)
{
    $include_re .= '|('.$re.')';
}


if ($selectors_option)
{
    my $bin_suffices = join(" ", sort keys %bin_suffices);
    my $txt_suffices = join(" ", sort keys %txt_suffices);

    print STDERR "\n";
    print STDERR "Exclude RE: ".$exclude_re."\n";
    print STDERR "\n";
    print STDERR "Include RE: ".$include_re."\n";
    print STDERR "\n";
    print STDERR "Exclude suffices: ".$bin_suffices."\n";
    print STDERR "\n";
    print STDERR "Include suffices: ".$txt_suffices."\n";
    print STDERR "\n";

    exit 0;
}

scalar(@ARGV) >= 1 or usage("no directory specified");


# Counters reported by --total.
my (
    $total_files_checked,
    $total_files_empty,
    $total_files_excluded_by_re,
    $total_files_included_by_re,
    $total_files_excluded_by_suffix,
    $total_files_included_by_suffix,
    $total_files_excluded_by_file,
    $total_files_included_by_file
) = (0,0,0,0,0,0,0,0);


# Report one classification decision to STDERR if the given option is set.
#   $inex_option - true to print, false to stay silent
#   $inex_str    - 'in' or 'ex' (prefix of "cluded")
#   $by          - name of the deciding rule
#   $name        - file name being reported
sub _by($$$$)
{
    my ($inex_option, $inex_str, $by, $name) = @_;

    printf(STDERR "%scluded by %13s: %s\n", $inex_str, $by, $name)
        if $inex_option;
}

# Report an included file (rule name, file name); honours --included.
sub inby($$) { _by($included_option, 'in', $_[0], $_[1]); }

# Report an excluded file (rule name, file name); honours --excluded.
sub exby($$) { _by($excluded_option, 'ex', $_[0], $_[1]); }


# A single long-running file(1) process is used as a co-process: one name is
# written to its stdin, one "name: description" line is read back.
local *FILE_RH;
local *FILE_WH;
my $file_pid;

$SIG{PIPE} = sub
{
    close FILE_WH;
    waitpid $file_pid, 0;
    die "file(1) pipe broken"
};

# -n makes file(1) flush its output after every file, -f - reads the names
# from stdin.
$file_pid = open2(\*FILE_RH, \*FILE_WH, "file -n -f -" )
    or die "can't fork: $!";


#+ main work

$| = 1;	# STDOUT autoflush

find(\&onfile, @ARGV);

#- main work


close FILE_WH;
waitpid $file_pid, 0;


# NOTE(review): column spacing of this picture is reconstructed; the field
# widths and the order of the values are as in the original.
format STDERR =

Total files:  checked    empty
              -------  -------
              @>>>>>>  @>>>>>>
$total_files_checked, $total_files_empty

               suffix       re  file(1)
              -------  -------  -------
excluded by:  @>>>>>>  @>>>>>>  @>>>>>>
$total_files_excluded_by_suffix, $total_files_excluded_by_re, $total_files_excluded_by_file
included by:  @>>>>>>  @>>>>>>  @>>>>>>
$total_files_included_by_suffix, $total_files_included_by_re, $total_files_included_by_file

.

write STDERR if $total_option;

exit 0;


# Extract the description from one line of file(1) output.
#   $fullname - the name that was sent to file(1)
#   $fread    - the chomped line read back
# Returns the description with leading white space removed.
# Dies if the line does not start with the name that was sent.
#
# The known "name:" prefix is stripped instead of splitting on a colon with
# a regex, because both file names and descriptions may contain ": ".
sub _file_description
{
    my ($fullname, $fread) = @_;

    my $prefix = $fullname.':';

    if ( substr($fread, 0, length $prefix) eq $prefix )
    {
        my $fdesc = substr($fread, length $prefix);
        $fdesc =~ s/^\s+//;
        return $fdesc;
    }

    # Unexpected answer: produce the most specific diagnostic possible.
    unless ( $fread =~ m|^(.+):\s+(.+)$| )
    {
        die "file(1) output does not match pattern:\n$fread\n";
    }

    die "file name after file(1) does not match the original one:\n".
        "\tbefore: $fullname\n".
        "\tafter : $1\n";
}


# File::Find callback: classify the current file and print its full name to
# STDOUT if it is considered plain text.  Reads $_ (name inside the current
# directory) and $File::Find::name (name including the path); updates the
# $total_files_* counters.  Non-regular and empty files are never printed.
sub onfile()
{
    my $shortname = $_;
    my $fullname  = "$File::Find::name";

    return unless -f $shortname;

    $total_files_checked++;

    if ( ! -s $shortname )
    {
        $total_files_empty++;
        return;
    }

    my $lcshortname = lc $shortname;

    if ( $lcshortname =~ m/$exclude_re/io )
    {
        exby('RE', $fullname);
        $total_files_excluded_by_re++;
        return;
    }

    if ( $lcshortname =~ m/$include_re/io )
    {
        inby('RE', $fullname);
        $total_files_included_by_re++;
    }
    else	# check by suffix
    {
        # List assignment leaves $suffix undefined when there is no match;
        # "my ... if ..." must not be used for that (undefined behaviour).
        my ($suffix) = $lcshortname =~ m/\.([^\.]+)$/;

        if ( defined $suffix and exists $bin_suffices{$suffix} )
        {
            exby('binary suffix', $fullname);
            $total_files_excluded_by_suffix++;
            return;
        }

        if ( defined $suffix and exists $txt_suffices{$suffix} )
        {
            inby('text suffix', $fullname);
            $total_files_included_by_suffix++;
        }
        else	# check by file(1)
        {
            print FILE_WH $fullname."\n"
                or die "bad write to file(1) pipe: $! $?";

            my $fread = <FILE_RH>;
            defined $fread
                or die "bad read from file(1) pipe: $! $?";
            chomp $fread;

            my $fdesc = _file_description($fullname, $fread);

            # Whole words only, so that e.g. "resource" is not taken for
            # "source".
            if ( $fdesc =~ m/\b(?:text|source)\b/ )
            {
                inby('file(1)', $fullname);
                $total_files_included_by_file++;
            }
            else
            {
                exby('file(1)', $fullname);
                $total_files_excluded_by_file++;
                return;
            }
        }
    }

    print $fullname . "\n";
}


=head1 AUTHOR

Dmitry Fedorov

=head1 COPYRIGHT

Copyright (C) 2003 Dmitry Fedorov

=head1 LICENSE

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

=head1 DISCLAIMER

The author disclaims any responsibility for any mangling of your system
etc, that this script may cause.

=cut