>From 872d7f08c47fa382ae18171a0806afa110de8fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simen=20Heggest=C3=B8yl?= Date: Sun, 8 May 2022 16:01:35 +0200 Subject: [PATCH] Add CSV separator guessing functionality Add two new commands: `csv-guess-set-separator' that automatically guesses and sets the CSV separator of the current buffer, and `csv-set-separator' for setting it manually. `csv-guess-set-separator' can be useful to add to the mode hook to have CSV mode guess and set the separator automatically when visiting a buffer: (add-hook 'csv-mode-hook 'csv-guess-set-separator) * csv-mode.el (csv-separators): Properly quote regexp values. (csv--set-separator-history, csv--preferred-separators): New variables. (csv-set-separator, csv-guess-set-separator) (csv-guess-separator, csv--separator-candidates) (csv--separator-score): New functions. * csv-mode-tests.el (csv-tests--data): New test data. (csv-tests-guess-separator, csv-tests-separator-candidates) (csv-tests-separator-score): New tests. --- csv-mode-tests.el | 80 ++++++++++++++++++++------- csv-mode.el | 137 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 187 insertions(+), 30 deletions(-) diff --git a/csv-mode-tests.el b/csv-mode-tests.el index 316dc4bb93..0caeab7d80 100644 --- a/csv-mode-tests.el +++ b/csv-mode-tests.el @@ -1,8 +1,8 @@ ;;; csv-mode-tests.el --- Tests for CSV mode -*- lexical-binding: t; -*- -;; Copyright (C) 2020 Free Software Foundation, Inc +;; Copyright (C) 2020-2022 Free Software Foundation, Inc -;; Author: Simen Heggestøyl +;; Author: Simen Heggestøyl ;; Keywords: ;; This program is free software; you can redistribute it and/or modify @@ -28,83 +28,121 @@ (require 'csv-mode) (eval-when-compile (require 'subr-x)) -(ert-deftest csv-mode-tests-end-of-field () +(ert-deftest csv-tests-end-of-field () (with-temp-buffer (csv-mode) (insert "aaa,bbb") (goto-char (point-min)) (csv-end-of-field) - (should (equal (buffer-substring (point-min) (point)) - "aaa")) + (should (equal (buffer-substring (point-min) (point)) "aaa")) (forward-char) (csv-end-of-field) (should (equal (buffer-substring (point-min) (point)) "aaa,bbb")))) -(ert-deftest csv-mode-tests-end-of-field-with-quotes () +(ert-deftest csv-tests-end-of-field-with-quotes () (with-temp-buffer (csv-mode) (insert "aaa,\"b,b\"") (goto-char (point-min)) (csv-end-of-field) - (should (equal (buffer-substring (point-min) (point)) - "aaa")) + (should (equal (buffer-substring (point-min) (point)) "aaa")) (forward-char) (csv-end-of-field) (should (equal (buffer-substring (point-min) (point)) "aaa,\"b,b\"")))) -(ert-deftest csv-mode-tests-beginning-of-field () +(ert-deftest csv-tests-beginning-of-field () (with-temp-buffer (csv-mode) (insert "aaa,bbb") (csv-beginning-of-field) - (should (equal (buffer-substring (point) (point-max)) - "bbb")) + (should (equal (buffer-substring (point) (point-max)) "bbb")) (backward-char) (csv-beginning-of-field) (should (equal (buffer-substring (point) (point-max)) "aaa,bbb")))) -(ert-deftest csv-mode-tests-beginning-of-field-with-quotes () +(ert-deftest csv-tests-beginning-of-field-with-quotes () (with-temp-buffer (csv-mode) (insert "aaa,\"b,b\"") (csv-beginning-of-field) - (should (equal (buffer-substring (point) (point-max)) - "\"b,b\"")) + (should (equal (buffer-substring (point) (point-max)) "\"b,b\"")) (backward-char) (csv-beginning-of-field) (should (equal (buffer-substring (point) (point-max)) "aaa,\"b,b\"")))) -(defun csv-mode-tests--align-fields (before after) +(defun csv-tests--align-fields (before after) (with-temp-buffer (insert (string-join before "\n")) (csv-align-fields t (point-min) (point-max)) (should (equal (buffer-string) (string-join after "\n"))))) -(ert-deftest csv-mode-tests-align-fields () - (csv-mode-tests--align-fields +(ert-deftest csv-tests-align-fields () + (csv-tests--align-fields '("aaa,bbb,ccc" "1,2,3") '("aaa, bbb, ccc" "1 , 2 , 3"))) -(ert-deftest csv-mode-tests-align-fields-with-quotes () - (csv-mode-tests--align-fields +(ert-deftest csv-tests-align-fields-with-quotes () + (csv-tests--align-fields '("aaa,\"b,b\",ccc" "1,2,3") '("aaa, \"b,b\", ccc" "1 , 2 , 3"))) ;; Bug#14053 -(ert-deftest csv-mode-tests-align-fields-double-quote-comma () - (csv-mode-tests--align-fields +(ert-deftest csv-tests-align-fields-double-quote-comma () + (csv-tests--align-fields '("1,2,3" "a,\"b\"\"c,\",d") '("1, 2 , 3" "a, \"b\"\"c,\", d"))) +(defvar csv-tests--data + "1,4;Sun, 2022-04-10;4,12 +8;Mon, 2022-04-11;3,19 +3,2;Tue, 2022-04-12;1,00 +2;Wed, 2022-04-13;0,37 +9;Wed, 2022-04-13;0,37") + +(ert-deftest csv-tests-guess-separator () + (should-not (csv-guess-separator "")) + (should (= (csv-guess-separator csv-tests--data 3) ?,)) + (should (= (csv-guess-separator csv-tests--data) ?\;)) + (should (= (csv-guess-separator csv-tests--data) + (csv-guess-separator csv-tests--data + (length csv-tests--data))))) + +(ert-deftest csv-tests-separator-candidates () + (should-not (csv--separator-candidates "")) + (should-not (csv--separator-candidates csv-tests--data 0)) + (should + (equal (sort (csv--separator-candidates csv-tests--data 4) #'<) + '(?, ?\;))) + (should + (equal (sort (csv--separator-candidates csv-tests--data) #'<) + '(?\s ?, ?- ?\;))) + (should + (equal + (sort (csv--separator-candidates csv-tests--data) #'<) + (sort (csv--separator-candidates csv-tests--data + (length csv-tests--data)) + #'<)))) + +(ert-deftest csv-tests-separator-score () + (should (< (csv--separator-score ?, csv-tests--data) + (csv--separator-score ?\s csv-tests--data) + (csv--separator-score ?- csv-tests--data))) + (should (= (csv--separator-score ?- csv-tests--data) + (csv--separator-score ?\; csv-tests--data))) + (should (= 0 (csv--separator-score ?\; csv-tests--data 0))) + (should (= (csv--separator-score ?\; csv-tests--data) + (csv--separator-score ?\; csv-tests--data + (length csv-tests--data))))) + (provide 'csv-mode-tests) ;;; csv-mode-tests.el ends here diff --git a/csv-mode.el b/csv-mode.el index 10ce166052..b2a881dde2 100644 --- a/csv-mode.el +++ b/csv-mode.el @@ -1,11 +1,11 @@ ;;; csv-mode.el --- Major mode for editing comma/char separated values -*- lexical-binding: t -*- -;; Copyright (C) 2003, 2004, 2012-2020 Free Software Foundation, Inc +;; Copyright (C) 2003, 2004, 2012-2022 Free Software Foundation, Inc ;; Author: "Francis J. Wright" ;; Maintainer: emacs-devel@gnu.org ;; Version: 1.19 -;; Package-Requires: ((emacs "24.1") (cl-lib "0.5")) +;; Package-Requires: ((emacs "27.1") (cl-lib "0.5")) ;; Keywords: convenience ;; This package is free software; you can redistribute it and/or modify @@ -119,7 +119,9 @@ ;;; Code: -(eval-when-compile (require 'cl-lib)) +(eval-when-compile + (require 'cl-lib) + (require 'subr-x)) (defgroup CSV nil "Major mode for editing files of comma-separated value type." @@ -163,12 +165,14 @@ session. Use `customize-set-variable' instead if that is required." (error "%S is already a quote" x))) value) (custom-set-default variable value) - (setq csv-separator-chars (mapcar #'string-to-char value) - csv--skip-chars (apply #'concat "^\n" csv-separators) - csv-separator-regexp (apply #'concat `("[" ,@value "]")) - csv-font-lock-keywords - ;; NB: csv-separator-face variable evaluates to itself. - `((,csv-separator-regexp (0 'csv-separator-face)))))) + (setq csv-separator-chars (mapcar #'string-to-char value)) + (setq csv--skip-chars + (apply #'concat "^\n" + (mapcar (lambda (s) (concat "\\" s)) value))) + (setq csv-separator-regexp (regexp-opt value)) + (setq csv-font-lock-keywords + ;; NB: csv-separator-face variable evaluates to itself. + `((,csv-separator-regexp (0 'csv-separator-face)))))) (defcustom csv-field-quotes '("\"") "Field quotes: a list of *single-character* strings. @@ -368,6 +372,23 @@ It must be either a string or nil." (modify-syntax-entry ?\n ">" csv-mode-syntax-table)) (setq csv-comment-start string)) +(defvar csv--set-separator-history nil) + +(defun csv-set-separator (sep) + "Set the CSV separator in the current buffer to SEP." + (interactive (list (read-char-from-minibuffer + "Separator: " nil 'csv--set-separator-history))) + (when (and (boundp 'csv-field-quotes) + (member (string sep) csv-field-quotes)) + (error "%c is already a quote" sep)) + (setq-local csv-separators (list (string sep))) + (setq-local csv-separator-chars (list sep)) + (setq-local csv--skip-chars (format "^\n\\%c" sep)) + (setq-local csv-separator-regexp (regexp-quote (string sep))) + (setq-local csv-font-lock-keywords + `((,csv-separator-regexp (0 'csv-separator-face)))) + (font-lock-refresh-defaults)) + ;;;###autoload (add-to-list 'auto-mode-alist '("\\.[Cc][Ss][Vv]\\'" . csv-mode)) @@ -1728,6 +1749,104 @@ setting works better)." (jit-lock-unregister #'csv--jit-align) (csv--jit-unalign (point-min) (point-max)))) (csv--header-flush)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Separator guessing +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(defvar csv--preferred-separators + '(?\t ?\s ?, ?: ?\;) + "Preferred separator characters in case of a tied score.") + +(defun csv-guess-set-separator () + "Guess and set the CSV separator of the current buffer. + +Add it to the mode hook to have CSV mode guess and set the +separator automatically when visiting a buffer: + + (add-hook \\='csv-mode-hook \\='csv-guess-set-separator)" + (interactive) + (let ((sep (csv-guess-separator + (buffer-substring-no-properties + (point-min) + ;; We're probably only going to look at the first 2048 + ;; or so chars, but take more than we probably need to + ;; minimize the chance of breaking the input in the + ;; middle of a (long) row. + (min 8192 (point-max))) + 2048))) + (when sep + (csv-set-separator sep)))) + +(defun csv-guess-separator (text &optional cutoff) + "Return a guess of which character is the CSV separator in TEXT." + (let ((best-separator nil) + (best-score 0)) + (dolist (candidate (csv--separator-candidates text cutoff)) + (let ((candidate-score + (csv--separator-score candidate text cutoff))) + (when (or (> candidate-score best-score) + (and (= candidate-score best-score) + (member candidate csv--preferred-separators))) + (setq best-separator candidate) + (setq best-score candidate-score)))) + best-separator)) + +(defun csv--separator-candidates (text &optional cutoff) + "Return a list of candidate CSV separators in TEXT. +When CUTOFF is passed, look only at the first CUTOFF number of characters." + (let ((chars (make-hash-table))) + (dolist (c (string-to-list + (if cutoff + (substring text 0 (min cutoff (length text))) + text))) + (when (and (not (gethash c chars)) + (or (= c ?\t) + (and (not (member c '(?. ?/ ?\" ?'))) + (not (member (get-char-code-property c 'general-category) + '(Lu Ll Lt Lm Lo Nd Nl No Ps Pe Cc Co)))))) + (puthash c t chars))) + (hash-table-keys chars))) + +(defun csv--separator-score (separator text &optional cutoff) + "Return a score on how likely SEPARATOR is a separator in TEXT. + +When CUTOFF is passed, stop the calculation at the next whole +line after having read CUTOFF number of characters. + +The scoring is based on the idea that most CSV data is tabular, +i.e. separators should appear equally often on each line. +Furthermore, more commonly appearing characters are scored higher +than those who appear less often. + +Adapted from the paper \"Wrangling Messy CSV Files by Detecting +Row and Type Patterns\" by Gerrit J.J. van den Burg , Alfredo +Nazábal, and Charles Sutton: https://arxiv.org/abs/1811.11242." + (let ((groups + (with-temp-buffer + (csv-set-separator separator) + (save-excursion + (insert text)) + (let ((groups (make-hash-table)) + (chars-read 0)) + (while (and (/= (point) (point-max)) + (or (not cutoff) + (< chars-read cutoff))) + (let* ((lep (line-end-position)) + (nfields (length (csv--collect-fields lep)))) + (cl-incf (gethash nfields groups 0)) + (cl-incf chars-read (- lep (point))) + (goto-char (+ lep 1)))) + groups))) + (sum 0)) + (maphash + (lambda (length num) + (cl-incf sum (* num (/ (- length 1) (float length))))) + groups) + (let ((unique-groups (hash-table-count groups))) + (if (= 0 unique-groups) + 0 + (/ sum unique-groups))))) ;;; TSV support -- 2.35.1