bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Split -b Question


From: Jim Meyering
Subject: Re: Split -b Question
Date: Wed, 09 Apr 2003 16:46:21 +0200

Paul Eggert <address@hidden> wrote:
> Luke Hassell <address@hidden> writes:
>
>> I must have missed something obvious.

Thank you for reporting that, Luke.

> No, you haven't.  split doesn't support large files in coreutils 5.0;
> this has been a FIXME for a while, I think.  Here's a patch.
>
> 2003-04-08  Paul Eggert  <address@hidden>
>
>       * src/split.c: Add support for large files.
>         Include "inttostr.h".
>         (bytes_split, lines_split, line_bytes_split, main):
>         Use uintmax_t, not size_t, for file sizes.
>         Use umaxtostr to print file sizes.

Thanks, Paul!
I'd already done most of that.
Here's what I have now:

        * src/split.c (bytes_split): Use size_t temporary (rather than
        uintmax_t original) in remaining computations.  From Paul Eggert.

        Handle command line option arguments of 2^31 or larger.
        This allows, e.g., splitting into files of size 2GB and larger,
        and running split --lines=N with N=2^31 or more.
        But for --line-bytes=N, the restriction that N <= SIZE_MAX
        remains (for now), due to the way it is implemented.

        * src/split.c: Include "inttostr.h".
        (bytes_split, lines_split, line_bytes_split, main):
        Use uintmax_t, not size_t, for file sizes.
        (main): Give a better diagnostic for option arguments == 0.
        Use umaxtostr to print file sizes.
        Reported by Luke Hassell.

        * tests/misc/Makefile.am (TESTS): Add split-fail.
        * tests/misc/split-fail: New file.

        * src/split.c (bytes_split, line_bytes_split): Rename parameter:
        nchars -> n_bytes.
        (lines_split): Rename parameter: nlines -> n_lines.
        (main): Rename local variable: accum -> n_units.
        (main): Use STDIN_FILENO, not literal `0'.

Index: split.c
===================================================================
RCS file: /fetish/cu/src/split.c,v
retrieving revision 1.78
retrieving revision 1.85
diff -u -p -u -p -r1.78 -r1.85
--- split.c     11 Mar 2003 20:48:36 -0000      1.78
+++ split.c     9 Apr 2003 14:40:05 -0000       1.85
@@ -33,6 +33,7 @@
 #include "error.h"
 #include "full-read.h"
 #include "full-write.h"
+#include "inttostr.h"
 #include "posixver.h"
 #include "safe-read.h"
 #include "xstrtol.h"
@@ -196,16 +197,16 @@ cwrite (int new_file_flag, const char *b
     error (EXIT_FAILURE, errno, "%s", outfile);
 }
 
-/* Split into pieces of exactly NCHARS bytes.
+/* Split into pieces of exactly N_BYTES bytes.
    Use buffer BUF, whose size is BUFSIZE.  */
 
 static void
-bytes_split (size_t nchars, char *buf, size_t bufsize)
+bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
 {
   size_t n_read;
   int new_file_flag = 1;
   size_t to_read;
-  size_t to_write = nchars;
+  uintmax_t to_write = n_bytes;
   char *bp_out;
 
   do
@@ -227,27 +228,30 @@ bytes_split (size_t nchars, char *buf, s
                }
              break;
            }
-
-         cwrite (new_file_flag, bp_out, to_write);
-         bp_out += to_write;
-         to_read -= to_write;
-         new_file_flag = 1;
-         to_write = nchars;
+         else
+           {
+             size_t w = to_write;
+             cwrite (new_file_flag, bp_out, w);
+             bp_out += w;
+             to_read -= w;
+             new_file_flag = 1;
+             to_write = n_bytes;
+           }
        }
     }
   while (n_read == bufsize);
 }
 
-/* Split into pieces of exactly NLINES lines.
+/* Split into pieces of exactly N_LINES lines.
    Use buffer BUF, whose size is BUFSIZE.  */
 
 static void
-lines_split (size_t nlines, char *buf, size_t bufsize)
+lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
 {
   size_t n_read;
   char *bp, *bp_out, *eob;
   int new_file_flag = 1;
-  size_t n = 0;
+  uintmax_t n = 0;
 
   do
     {
@@ -272,7 +276,7 @@ lines_split (size_t nlines, char *buf, s
            }
 
          ++bp;
-         if (++n >= nlines)
+         if (++n >= n_lines)
            {
              cwrite (new_file_flag, bp_out, bp - bp_out);
              bp_out = bp;
@@ -285,33 +289,36 @@ lines_split (size_t nlines, char *buf, s
 }
 
 /* Split into pieces that are as large as possible while still not more
-   than NCHARS bytes, and are split on line boundaries except
-   where lines longer than NCHARS bytes occur. */
+   than N_BYTES bytes, and are split on line boundaries except
+   where lines longer than N_BYTES bytes occur.
+   FIXME: don't require a buffer of size N_BYTES, in case N_BYTES
+   is very large.  */
 
 static void
-line_bytes_split (size_t nchars)
+line_bytes_split (uintmax_t n_bytes)
 {
   size_t n_read;
   char *bp;
   int eof = 0;
   size_t n_buffered = 0;
-  char *buf = (char *) xmalloc (nchars);
+  size_t n = n_bytes;
+  char *buf = (char *) xmalloc (n);
 
   do
     {
       /* Fill up the full buffer size from the input file.  */
 
-      n_read = full_read (input_desc, buf + n_buffered, nchars - n_buffered);
+      n_read = full_read (input_desc, buf + n_buffered, n_bytes - n_buffered);
       if (n_read == SAFE_READ_ERROR)
        error (EXIT_FAILURE, errno, "%s", infile);
 
       n_buffered += n_read;
-      if (n_buffered != nchars)
+      if (n_buffered != n_bytes)
        eof = 1;
 
       /* Find where to end this chunk.  */
       bp = buf + n_buffered;
-      if (n_buffered == nchars)
+      if (n_buffered == n_bytes)
        {
          while (bp > buf && bp[-1] != '\n')
            bp--;
@@ -347,14 +354,13 @@ int
 main (int argc, char **argv)
 {
   struct stat stat_buf;
-  size_t num;                  /* numeric argument from command line */
   enum
     {
       type_undef, type_bytes, type_byteslines, type_lines, type_digits
     } split_type = type_undef;
   size_t in_blk_size;          /* optimal block size of input file device */
   char *buf;                   /* file i/o buffer */
-  size_t accum = 0;
+  uintmax_t n_units;
   int c;
   int digits_optind = 0;
 
@@ -374,7 +380,6 @@ main (int argc, char **argv)
     {
       /* This is the argv-index of the option we will read next.  */
       int this_optind = optind ? optind : 1;
-      long int tmp_long;
 
       c = getopt_long (argc, argv, "0123456789C:a:b:l:", longopts, NULL);
       if (c == -1)
@@ -389,7 +394,7 @@ main (int argc, char **argv)
          {
            unsigned long tmp;
            if (xstrtoul (optarg, NULL, 10, &tmp, "") != LONGINT_OK
-               || SIZE_MAX < tmp)
+               || tmp == 0 || SIZE_MAX < tmp)
              {
                error (0, 0, _("%s: invalid suffix length"), optarg);
                usage (EXIT_FAILURE);
@@ -402,39 +407,36 @@ main (int argc, char **argv)
          if (split_type != type_undef)
            FAIL_ONLY_ONE_WAY ();
          split_type = type_bytes;
-         if (xstrtol (optarg, NULL, 10, &tmp_long, "bkm") != LONGINT_OK
-             || tmp_long < 0 || tmp_long > INT_MAX)
+         if (xstrtoumax (optarg, NULL, 10, &n_units, "bkm") != LONGINT_OK
+             || n_units == 0)
            {
              error (0, 0, _("%s: invalid number of bytes"), optarg);
              usage (EXIT_FAILURE);
            }
-         accum = /* FIXME: */ (int) tmp_long;
          break;
 
        case 'l':
          if (split_type != type_undef)
            FAIL_ONLY_ONE_WAY ();
          split_type = type_lines;
-         if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
-             || tmp_long < 0 || tmp_long > INT_MAX)
+         if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK
+             || n_units == 0)
            {
              error (0, 0, _("%s: invalid number of lines"), optarg);
              usage (EXIT_FAILURE);
            }
-         accum = /* FIXME */ (int) tmp_long;
          break;
 
        case 'C':
          if (split_type != type_undef)
            FAIL_ONLY_ONE_WAY ();
          split_type = type_byteslines;
-         if (xstrtol (optarg, NULL, 10, &tmp_long, "bkm") != LONGINT_OK
-             || tmp_long < 0 ||  tmp_long > INT_MAX)
+         if (xstrtoumax (optarg, NULL, 10, &n_units, "bkm") != LONGINT_OK
+             || n_units == 0 || SIZE_MAX < n_units)
            {
              error (0, 0, _("%s: invalid number of bytes"), optarg);
              usage (EXIT_FAILURE);
            }
-         accum = /* FIXME */ (int) tmp_long;
          break;
 
        case '0':
@@ -447,13 +449,18 @@ main (int argc, char **argv)
        case '7':
        case '8':
        case '9':
+         if (split_type == type_undef)
+           {
+             split_type = type_digits;
+             n_units = 0;
+           }
          if (split_type != type_undef && split_type != type_digits)
            FAIL_ONLY_ONE_WAY ();
          if (digits_optind != 0 && digits_optind != this_optind)
-           accum = 0;          /* More than one number given; ignore other. */
+           n_units = 0;        /* More than one number given; ignore other. */
          digits_optind = this_optind;
-         split_type = type_digits;
-         accum = accum * 10 + c - '0';
+         n_units = n_units * 10 + c - '0';
+         /* FIXME: detect overflow, or remove this support altogether */
          break;
 
        case_GETOPT_HELP_CHAR;
@@ -467,7 +474,9 @@ main (int argc, char **argv)
 
   if (digits_optind && 200112 <= posix2_version ())
     {
-      error (0, 0, _("`-%d' option is obsolete; use `-l %d'"), accum, accum);
+      char buffer[INT_BUFSIZE_BOUND (uintmax_t)];
+      char const *a = umaxtostr (n_units, buffer);
+      error (0, 0, _("`-%s' option is obsolete; use `-l %s'"), a, a);
       usage (EXIT_FAILURE);
     }
 
@@ -475,15 +484,16 @@ main (int argc, char **argv)
   if (split_type == type_undef)
     {
       split_type = type_lines;
-      accum = 1000;
+      n_units = 1000;
     }
 
-  if (accum < 1)
+  if (n_units == 0)
     {
-      error (0, 0, _("invalid number"));
+      /* FIXME: be sure to remove this block when removing
+        support for obsolete options like `-10'.  */
+      error (0, 0, _("invalid number of lines: 0"));
       usage (EXIT_FAILURE);
     }
-  num = accum;
 
   /* Get out the filename arguments.  */
 
@@ -501,7 +511,7 @@ main (int argc, char **argv)
 
   /* Open the input file.  */
   if (STREQ (infile, "-"))
-    input_desc = 0;
+    input_desc = STDIN_FILENO;
   else
     {
       input_desc = open (infile, O_RDONLY);
@@ -526,15 +536,15 @@ main (int argc, char **argv)
     {
     case type_digits:
     case type_lines:
-      lines_split (num, buf, in_blk_size);
+      lines_split (n_units, buf, in_blk_size);
       break;
 
     case type_bytes:
-      bytes_split (num, buf, in_blk_size);
+      bytes_split (n_units, buf, in_blk_size);
       break;
 
     case type_byteslines:
-      line_bytes_split (num);
+      line_bytes_split (n_units);
       break;
 
     default:




reply via email to

[Prev in Thread] Current Thread [Next in Thread]