5
\$\begingroup\$

This is my first Rust project (I'm primarily a C guy). I want to know if my code is properly idiomatic (Rusty?). Also, are there any inefficiencies?

The code defines an iterator (Searcher) that takes a regex, a starting directory, and an optional maximum search depth. It generates locations of regex matches within the file system.

use std::fs;
use std::io;
use std::io::BufRead;
use std::iter;
use std::path;
use std::vec::Vec;

use regex;

pub struct Location {
    pub file: path::PathBuf,
    pub line: usize,
    pub text: String,
}

struct FileScanner<'a> {
    path: path::PathBuf,
    pattern: &'a regex::Regex,
    lines: iter::Enumerate<io::Lines<io::BufReader<fs::File>>>,
}

fn is_line_printable(line: &str) -> bool {
    line.chars()
        .all(|c| c.is_ascii_graphic() || c.is_whitespace())
}

impl<'a> FileScanner<'a> {
    fn build(path: path::PathBuf, pattern: &'a regex::Regex) -> Option<Self> {
        let handle = match fs::File::open(&path) {
            Ok(h) => h,
            Err(_) => return None,
        };
        let reader = io::BufReader::new(handle);
        Some(FileScanner {
            path,
            pattern,
            lines: reader.lines().into_iter().enumerate(),
        })
    }
}

impl<'a> Iterator for FileScanner<'a> {
    type Item = Location;

    fn next(&mut self) -> Option<Location> {
        loop {
            let (index, line) = match self.lines.next() {
                Some((i, Ok(l))) => (i, l),
                _ => return None,
            };
            if !is_line_printable(&line) {
                return None;
            }

            let pattern_match = match self.pattern.find(&line) {
                Some(m) => m,
                None => continue,
            };
            let start = pattern_match.start();
            let end = pattern_match.end();
            return Some(Location {
                file: self.path.to_path_buf(),
                line: index + 1,
                text: String::from(&line[start..end]),
            });
        }
    }
}

pub struct Searcher<'a> {
    pattern: &'a regex::Regex,
    max_depth: Option<u8>,
    readers: Vec<fs::ReadDir>,
    current_scanner: Option<FileScanner<'a>>,
}

impl<'a> Searcher<'a> {
    pub fn build(
        pattern: &'a regex::Regex,
        directory: &'a path::Path,
        depth: Option<u8>,
    ) -> Result<Self, String> {
        match depth {
            Some(0) => return Err(String::from("Depth cannot be 0")),
            _ => (),
        };

        let reader = match fs::read_dir(directory) {
            Ok(r) => r,
            Err(error) => return Err(error.to_string()),
        };
        let readers = vec![reader];

        Ok(Searcher {
            pattern,
            max_depth: depth,
            readers,
            current_scanner: None,
        })
    }

    fn push_directory(&mut self, directory: path::PathBuf) {
        match self.max_depth {
            Some(depth) if usize::from(depth) == self.readers.len() => return,
            _ => (),
        }

        let reader = match fs::read_dir(directory) {
            Ok(r) => r,
            Err(_) => return,
        };
        self.readers.push(reader);
    }

    fn next_match_from_file(&mut self) -> Option<Location> {
        let scanner = match &mut self.current_scanner {
            Some(s) => s,
            None => return None,
        };

        let location = scanner.next();
        if location.is_none() {
            self.current_scanner = None;
        }
        location
    }
}

impl<'a> Iterator for Searcher<'a> {
    type Item = Location;

    fn next(&mut self) -> Option<Location> {
        let location = self.next_match_from_file();
        if location.is_some() {
            return location;
        }

        while self.readers.len() > 0 {
            let current_reader = self.readers.last_mut().unwrap();
            let entry = match current_reader.next() {
                Some(Ok(ent)) => ent,
                Some(Err(_)) | None => {
                    self.readers.pop();
                    continue;
                }
            };

            let path = entry.path();
            if path.is_dir() {
                self.push_directory(path);
                continue;
            }

            self.current_scanner = FileScanner::build(path, self.pattern);
            let location = self.next_match_from_file();
            if location.is_some() {
                return location;
            }
        }

        None
    }
}
\$\endgroup\$

1 Answer 1

3
\$\begingroup\$

You did a good job, your code already leverages on main rust features, but there are some suggestions to make your code more idiomatic and efficient:

Idiomatic improvements

1. Use ? for error propagation

Instead of manual match expressions:

// Current approach
let reader = match fs::read_dir(directory) {
    Ok(r) => r,
    Err(error) => return Err(error.to_string()),
};

// More idiomatic
let reader = fs::read_dir(directory).map_err(|e| e.to_string())?;

2. Use if let for simpler pattern matching (it's cleaner)

// Instead of
match depth {
    Some(0) => return Err(String::from("Depth cannot be 0")),
    _ => (),
};

// More idiomatic
if let Some(0) = depth {
    return Err("Depth cannot be 0".to_string());
}

3. Avoid unnecessary empty match arms

// Instead of
match self.max_depth {
    Some(depth) if usize::from(depth) == self.readers.len() => return,
    _ => (),
}

// Better
if let Some(depth) = self.max_depth {
    if usize::from(depth) == self.readers.len() {
        return;
    }
}

Performance considerations

  1. Binary file detection: Your is_line_printable function reads every character of every line. Consider checking a sample of bytes at the beginning of the file instead.

  2. Path cloning: You're creating a new PathBuf for each match with self.path.to_path_buf(). This is sometimes unavoidable (unfortunately) but worth noting.

  3. File filtering: Add extension filtering to avoid scanning binary files altogether.

  4. Memory usage: The current approach holds file handles open until matches are found, which is generally fine but could be improved.

Here's an improved version of your code:

use std::fs;
use std::io::{self, BufRead};
use std::iter;
use std::path::{Path, PathBuf};
use std::vec::Vec;

use regex::Regex;

pub struct Location {
    pub file: PathBuf,
    pub line: usize,
    pub line_content: String, // store whole line for context
    pub match_text: String,   // store matched text separately
    pub match_span: (usize, usize), // start and end positions
}

struct FileScanner<'a> {
    path: PathBuf,
    pattern: &'a Regex,
    lines: iter::Enumerate<io::Lines<io::BufReader<fs::File>>>,
}

// improved binary detection - (check first few bytes)
fn is_likely_binary(path: &Path) -> io::Result<bool> {
    let file = fs::File::open(path)?;
    let mut buffer = [0; 512]; // sample first 512 bytes
    let bytes_read = file.take(512).read(&mut buffer)?;
    
    // check for null bytes or high concentration of non-ASCII chars
    let non_text_chars = buffer[..bytes_read]
        .iter()
        .filter(|&&b| b == 0 || b > 127)
        .count();
        
    Ok(non_text_chars > bytes_read / 4) // > 25% non-text chars indicates binary
}

impl<'a> FileScanner<'a> {
    fn new(path: PathBuf, pattern: &'a Regex) -> io::Result<Self> {
        let handle = fs::File::open(&path)?;
        let reader = io::BufReader::new(handle);
        
        Ok(FileScanner {
            path,
            pattern,
            lines: reader.lines().enumerate(),
        })
    }
    
    fn build(path: PathBuf, pattern: &'a Regex) -> Option<Self> {
        if is_likely_binary(&path).unwrap_or(true) {
            return None; // skip
        }
        
        Self::new(path).ok()
    }
}

impl<'a> Iterator for FileScanner<'a> {
    type Item = Location;

    fn next(&mut self) -> Option<Location> {
        while let Some((index, line_result)) = self.lines.next() {
            let line = match line_result {
                Ok(l) => l,
                Err(_) => continue, // skip lines with IO errors instead of aborting
            };

            if let Some(pattern_match) = self.pattern.find(&line) {
                let start = pattern_match.start();
                let end = pattern_match.end();
                
                return Some(Location {
                    file: self.path.clone(), // clone is necessary here
                    line: index + 1,
                    line_content: line.clone(),
                    match_text: line[start..end].to_string(),
                    match_span: (start, end),
                });
            }
        }
        None
    }
}

pub struct Searcher<'a> {
    pattern: &'a Regex,
    max_depth: Option<u8>,
    readers: Vec<fs::ReadDir>,
    current_scanner: Option<FileScanner<'a>>,
    allowed_extensions: Option<Vec<String>>, // add extension-based filtering
}

impl<'a> Searcher<'a> {
    pub fn new(
        pattern: &'a Regex,
        directory: &Path,
        depth: Option<u8>,
        extensions: Option<Vec<String>>,
    ) -> Result<Self, String> {
        if let Some(0) = depth {
            return Err("Depth cannot be 0".to_string());
        }

        let reader = fs::read_dir(directory).map_err(|e| e.to_string())?;
        
        Ok(Searcher {
            pattern,
            max_depth: depth,
            readers: vec![reader],
            current_scanner: None,
            allowed_extensions: extensions,
        })
    }
    
    pub fn build(
        pattern: &'a Regex,
        directory: &Path,
        depth: Option<u8>,
    ) -> Result<Self, String> {
        Self::new(pattern, directory, depth, None)
    }

    fn should_scan_file(&self, path: &Path) -> bool {
        if let Some(ref extensions) = self.allowed_extensions {
            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
                return extensions.iter().any(|e| e == ext);
            }
            return false; // no extension
        }
        true // no extension filtering
    }

    fn push_directory(&mut self, directory: PathBuf) {
        // early return if we've reached max depth
        if let Some(depth) = self.max_depth {
            if usize::from(depth) == self.readers.len() {
                return;
            }
        }

        if let Ok(reader) = fs::read_dir(directory) {
            self.readers.push(reader);
        }
    }

    fn next_match_from_file(&mut self) -> Option<Location> {
        let location = self.current_scanner.as_mut()?.next();
        if location.is_none() {
            self.current_scanner = None;
        }
        location
    }
}

impl<'a> Iterator for Searcher<'a> {
    type Item = Location;

    fn next(&mut self) -> Option<Location> {
        if let Some(location) = self.next_match_from_file() {
            return Some(location);
        }

        while let Some(reader) = self.readers.last_mut() {
            // here we search through all directories
            match reader.next() {
                Some(Ok(entry)) => {
                    let path = entry.path();
                    
                    if path.is_dir() {
                        self.push_directory(path);
                    } else if self.should_scan_file(&path) {
                        self.current_scanner = FileScanner::build(path, self.pattern);
                        if let Some(location) = self.next_match_from_file() {
                            return Some(location);
                        }
                    }
                }
                _ => {
                    // if it reaches here it means there's an error
                    self.readers.pop();
                }
            }
        }

        None
    }
}
\$\endgroup\$

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.