[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemacs-commit] qemacs buffer.c qe.c qe.h
From: Charlie Gordon
Subject: [Qemacs-commit] qemacs buffer.c qe.c qe.h
Date: Sun, 16 Aug 2015 23:15:02 +0000
CVSROOT: /sources/qemacs
Module name: qemacs
Changes by: Charlie Gordon <chqrlie> 15/08/16 23:15:02
Modified files:
. : buffer.c qe.c qe.h
Log message:
search: another 35% speed improvement on large files
- improve utf8 handling in eb_prevc() using eb_read_one_byte()
- added utf8_is_trailing_byte(b) to standardize testing for this
- improve hex searching by 50% using eb_read_one_byte()
- improve regular searching by 35% by removing extra calls to eb_nextc()
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/buffer.c?cvsroot=qemacs&r1=1.85&r2=1.86
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.c?cvsroot=qemacs&r1=1.200&r2=1.201
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.h?cvsroot=qemacs&r1=1.197&r2=1.198
Patches:
Index: buffer.c
===================================================================
RCS file: /sources/qemacs/qemacs/buffer.c,v
retrieving revision 1.85
retrieving revision 1.86
diff -u -b -r1.85 -r1.86
--- buffer.c 16 Aug 2015 19:22:01 -0000 1.85
+++ buffer.c 16 Aug 2015 23:15:01 -0000 1.86
@@ -1214,6 +1214,7 @@
if (ch == ESCAPE_CHAR) {
eb_read(b, offset - 1, buf, MAX_CHAR_BYTES);
b->charset_state.p = buf;
+ /* XXX: incorrect behaviour on ill encoded utf8 sequences */
ch = b->charset_state.decode_func(&b->charset_state);
offset += (b->charset_state.p - buf) - 1;
}
@@ -1297,27 +1298,36 @@
offset = 0;
ch = '\n';
} else {
- /* XXX: it cannot be generic here. Should use the
- line/column system to be really generic */
- char_size = b->charset_state.char_size;
- offset -= char_size;
- q = buf + sizeof(buf) - char_size;
- eb_read(b, offset, q, char_size);
if (b->charset == &charset_utf8) {
- while (*q >= 0x80 && *q < 0xc0) {
- if (offset == 0 || q == buf) {
- /* error: take only previous byte */
- offset += buf - 1 - q;
+ char_size = 1;
+ offset -= 1;
+ ch = eb_read_one_byte(b, offset);
+ if (utf8_is_trailing_byte(ch)) {
+ int offset1 = offset;
+ q = buf + sizeof(buf);
+ *--q = ch;
+ while (utf8_is_trailing_byte(ch) && offset > 0 && q > buf) {
+ offset -= 1;
+ *--q = ch = eb_read_one_byte(b, offset);
+ }
+ if (ch >= 0xc0) {
+ ch = utf8_decode((const char **)(void *)&q);
+ }
+ if (q != buf + sizeof(buf)) {
+ /* decoding error: only take the last byte */
+ offset = offset1;
ch = buf[sizeof(buf) - 1];
- goto the_end;
}
- offset--;
- q--;
- eb_read(b, offset, q, 1);
}
- ch = utf8_decode((const char **)(void *)&q);
} else {
- /* CG: this only works for stateless charsets */
+ /* XXX: this only works for stateless charsets.
+ * it would fail for utf-16 and east-asian encodings.
+ * Should use the line/column system to be really generic
+ */
+ char_size = b->charset_state.char_size;
+ offset -= char_size;
+ q = buf + sizeof(buf) - char_size;
+ eb_read(b, offset, q, char_size);
b->charset_state.p = q;
ch = b->charset_state.decode_func(&b->charset_state);
}
@@ -1340,7 +1350,6 @@
}
}
}
- the_end:
*prev_ptr = offset;
return ch;
}
@@ -1482,7 +1491,7 @@
/* Round offset down to character boundary */
u8 buf[1];
while (offset > 0 && eb_read(b, offset, buf, 1) == 1 &&
- (buf[0] & 0xC0) == 0x80) {
+ utf8_is_trailing_byte(buf[0])) {
/* backtrack over trailing bytes */
offset--;
}
Index: qe.c
===================================================================
RCS file: /sources/qemacs/qemacs/qe.c,v
retrieving revision 1.200
retrieving revision 1.201
diff -u -b -r1.200 -r1.201
--- qe.c 16 Aug 2015 18:01:06 -0000 1.200
+++ qe.c 16 Aug 2015 23:15:01 -0000 1.201
@@ -2251,9 +2251,8 @@
{
char buf[256];
buf_t outbuf, *out;
- unsigned char cc;
int line_num, col_num;
- int c, c2, offset1, offset2, off;
+ int c, c2, cc, offset1, offset2, off;
out = buf_init(&outbuf, buf, sizeof(buf));
if (s->offset < s->b->total_size) {
@@ -2295,11 +2294,11 @@
/* Display buffer bytes if char is encoded */
off = s->offset;
- eb_read(s->b, off++, &cc, 1);
+ cc = eb_read_one_byte(s->b, off++);
if (cc != c || c2 || off != offset2) {
buf_printf(out, " [%02X", cc);
while (off < offset2) {
- eb_read(s->b, off++, &cc, 1);
+ cc = eb_read_one_byte(s->b, off++);
buf_printf(out, " %02X", cc);
}
buf_put_byte(out, ']');
@@ -6472,7 +6471,7 @@
int *found_offset, int *found_end)
{
int total_size = b->total_size;
- int c, c2, offset = start_offset, offset1, offset2, pos;
+ int c, c2, offset = start_offset, offset1, offset2, offset3, pos;
if (len == 0)
return 0;
@@ -6504,19 +6503,16 @@
if (offset >= total_size)
return 0;
- if ((offset & 0x1ffff) == 0) {
- /* check for search abort every 128k */
+ if ((offset & 0xfffff) == 0) {
+ /* check for search abort every megabyte */
if (abort_func && abort_func(abort_opaque))
return -1;
}
pos = 0;
for (offset2 = offset; offset2 < total_size;) {
- u8 data[1];
-
/* CG: Should bufferize a bit ? */
- eb_read(b, offset2++, data, 1);
- c = data[0];
+ c = eb_read_one_byte(b, offset2++);
c2 = buf[pos++];
if (c != c2)
break;
@@ -6530,33 +6526,35 @@
}
}
- for (;; (void)(dir >= 0 && eb_nextc(b, offset, &offset))) {
+ for (offset1 = offset;;) {
if (dir < 0) {
if (offset == 0)
return 0;
eb_prevc(b, offset, &offset);
- }
+ } else {
+ offset = offset1;
if (offset >= total_size)
return 0;
-
- if ((offset & 0x1ffff) == 0) {
- /* check for search abort every 128k */
+ }
+ if ((offset & 0xfffff) == 0) {
+ /* check for search abort every megabyte */
if (abort_func && abort_func(abort_opaque))
return -1;
}
if (flags & SEARCH_FLAG_WORD) {
/* check for start of word */
- c = eb_prevc(b, offset, &offset1);
+ c = eb_prevc(b, offset, &offset3);
if (qe_isword(c))
continue;
}
- pos = 0;
- offset2 = offset;
- while (offset2 < total_size) {
/* CG: XXX: Should use buffer specific accelerator */
- c = eb_nextc(b, offset2, &offset2);
+ /* Get first char separately to compute offset1 */
+ c = eb_nextc(b, offset, &offset1);
+
+ pos = 0;
+ for (offset2 = offset1;;) {
c2 = buf[pos++];
if (flags & SEARCH_FLAG_IGNORECASE) {
if (qe_toupper(c) != qe_toupper(c2))
@@ -6568,7 +6566,7 @@
if (pos >= len) {
if (flags & SEARCH_FLAG_WORD) {
/* check for end of word */
- c = eb_nextc(b, offset2, &offset1);
+ c = eb_nextc(b, offset2, &offset3);
if (qe_isword(c))
break;
}
@@ -6578,6 +6576,9 @@
return 1;
}
}
+ if (offset2 >= total_size)
+ break;
+ c = eb_nextc(b, offset2, &offset2);
}
}
}
Index: qe.h
===================================================================
RCS file: /sources/qemacs/qemacs/qe.h,v
retrieving revision 1.197
retrieving revision 1.198
diff -u -b -r1.197 -r1.198
--- qe.h 16 Aug 2015 19:22:01 -0000 1.197
+++ qe.h 16 Aug 2015 23:15:02 -0000 1.198
@@ -601,6 +601,7 @@
void qe_register_charset(QECharset *charset);
extern unsigned char utf8_length[256];
+static inline int utf8_is_trailing_byte(int c) { return (c & 0xC0) == 0x80; }
int utf8_encode(char *q, int c);
int utf8_decode(const char **pp);
int utf8_to_unicode(unsigned int *dest, int dest_length, const char *str);
- [Qemacs-commit] qemacs buffer.c qe.c qe.h,
Charlie Gordon <=