Replies: 7 comments 5 replies
-
Are you trying to count certain symbols? |
Beta Was this translation helpful? Give feedback.
-
If you just want to count symbols, this is the best I can offer you now: https://godbolt.org/z/1rejYz7rM |
Beta Was this translation helpful? Give feedback.
-
Obviously, I want to get the positions and types of the block elements. Can I compare this to home? Consider a simple structure that holds the values blockType, posStart, and *posEnd; each instance should then be appended to a std::vector. |
Beta Was this translation helpful? Give feedback.
-
#pragma once
#include <cassert>
#include <cstring>
#include <deque>
#include <eve/eve.hpp>
#include <vector>
#include "helper.hpp"
// #include "src/PerformanceLogger.hpp"
namespace tokenizer {
// Kinds of token the tokenizer can describe.
//
// parse() in this file emits only kNull, kTrue, kFalse, kArray, kArrayEnd,
// kObject and kObjectEnd; the remaining enumerators are declared but are not
// produced by the code visible here.
enum class TokenType {
  kNull,       // recorded when an 'n' byte starts a literal
  kTrue,       // recorded when a 't' byte starts a literal
  kFalse,      // recorded when an 'f' byte starts a literal
  kNumber,     // not emitted by parse() as written
  kString,     // not emitted by parse() as written
  kArray,      // '['
  kArrayEnd,   // ']'
  kObject,     // '{'
  kObjectEnd,  // '}'
  kComma,      // not emitted by parse() as written
  kColon,      // not emitted by parse() as written
  kError,      // not emitted by parse() as written
};
// EVE wide type holding 8 lanes of std::uint8_t.
using wide_u8 = eve::wide<std::uint8_t, eve::fixed<8>>;
// EVE wide type holding 16 lanes of std::uint8_t.
// NOTE: the "16" in the name is the lane count, not the element width; the
// element type is still std::uint8_t.
using wide_u16 = eve::wide<std::uint8_t, eve::fixed<16>>;
// Bytes that isWhiteSpace() reports as "whitespace". In addition to the
// whitespace/control characters, the list also contains ':' and ',', so those
// two separators are matched as well.
static const wide_u8 kSpaceCharList{' ',  '\t', '\n', '\r',
                                    '\f', '\b', ':',  ','};

// Returns true when `c` is equal to any lane of kSpaceCharList.
EVE_FORCEINLINE bool isWhiteSpace(const std::uint8_t& c) {
  const wide_u8 needle{c};  // broadcast `c` into every lane
  const auto matches = (kSpaceCharList == needle);
  return eve::any(matches);
}
/*
JSON String Escape Table

The 8 lanes hold: space, \t, \n, \r, \f, \b, \v and \a.
NOTE(review): the table contains whitespace/control characters themselves, not
the backslash that introduces a JSON escape sequence - confirm this is the
intended set.
*/
static const wide_u8 kEscapeTable{' ',  '\t', '\n', '\r',
                                  '\f', '\b', '\v', '\a'};

// Returns true when `c` is equal to any lane of kEscapeTable.
EVE_FORCEINLINE bool isEscape(const std::uint8_t& c) {
  // The scalar is compared directly against the table; the previously built
  // (and never used) broadcast copy of `c` has been dropped.
  return eve::any(kEscapeTable == c);
}
// JSON Number Table
/*
'0': 0x30, '1': 0x31, '2': 0x32, '3': 0x33, '4': 0x34, '5': 0x35, '6': 0x36,
'7': 0x37, '8': 0x38, '9': 0x39,
'-': 0x2d, '+': 0x2b, '.': 0x2e, 'e': 0x65, 'E': 0x45,
total count is 15

The table has 16 lanes, so one lane is padding. The padding lane repeats '0'
(0x30) rather than holding 0x00: a 0x00 lane would make isNumeric() report the
NUL byte as numeric.
*/
static const wide_u16 kNumericTable{0x30, 0x31, 0x32, 0x33, 0x34, 0x35,
                                    0x36, 0x37, 0x38, 0x39, 0x2d, 0x2b,
                                    0x2e, 0x65, 0x45, 0x30};

/*
Numeric Table lookup: returns true when `c` is one of the 15 characters listed
above (digits, '-', '+', '.', 'e', 'E').
*/
EVE_FORCEINLINE bool isNumeric(const std::uint8_t& c) {
  return eve::any(kNumericTable == c);
}
struct Token {
Token(TokenType type, int from) : type(type), from(from) {}
Token(TokenType type, int from, int* to) : type(type), from(from), to(to) {}
TokenType type;
int from;
int* to = nullptr;
};
// Position state for number, string, true, false and null values.
// Plain aggregate of a token type and two integer positions. It is not
// referenced by any other code visible in this file.
// NOTE(review): presumably `from`/`to` are the start and end offsets of the
// value - confirm against the intended user of this struct.
struct State {
TokenType type;
int from;
int to;
};
// Disabled constants. Read as little-endian 32-bit words they spell the bytes
// "alse", "true" and "null" respectively.
// static constexpr uint32_t kFalseBin = 0x65736c61;
// static constexpr uint32_t kTrueBin = 0x65757274;
// static constexpr uint32_t kNullBin = 0x6c6c756e;
// Reference to the object returned by StringBitset::getInstance(). parse()
// uses its hasWhitespace(), hasEscape() and hasNumeric() members to classify
// bytes. StringBitset is not declared in this file (presumably it comes from
// "helper.hpp" - TODO confirm).
static auto& strBitset = StringBitset::getInstance();
// Scans the `size` bytes starting at `data` and records the structural tokens
// that are found.
//
// Tokens recorded (each with `from` set to the byte index of its first
// character and `to` left as nullptr):
//   '{' -> kObject      '}' -> kObjectEnd
//   '[' -> kArray       ']' -> kArrayEnd
//   't' -> kTrue        'f' -> kFalse        'n' -> kNull
// For the three literals only the first byte is inspected; the remaining
// 3/4/3 bytes are skipped without being validated.
//
// Strings and numbers are skipped over but no token is recorded for them.
// Bytes for which strBitset.hasWhitespace() is true, and any byte that matches
// none of the cases above, are ignored.
//
// @param data  Buffer to scan; only indices [0, size) are read.
// @param size  Number of bytes in `data`.
// @return      Pointer to a vector allocated with `new`; the caller owns it
//              and is responsible for deleting it.
const EVE_FORCEINLINE std::vector<Token>* parse(std::uint8_t* data,
                                                std::size_t size) {
  auto* tokenList = new std::vector<Token>();

  for (std::size_t i = 0; i < size; ++i) {
    // Whitespace check.
    if (strBitset.hasWhitespace(data[i])) {
      continue;
    }

    switch (data[i]) {
      case '{':
        tokenList->emplace_back(TokenType::kObject, static_cast<int>(i));
        break;
      case '}':
        tokenList->emplace_back(TokenType::kObjectEnd, static_cast<int>(i));
        break;
      case '[':
        tokenList->emplace_back(TokenType::kArray, static_cast<int>(i));
        break;
      case ']':
        tokenList->emplace_back(TokenType::kArrayEnd, static_cast<int>(i));
        break;
      case '"':
        // Advance to the closing quote. The index is checked against `size`
        // before every read so an unterminated string cannot run past the end
        // of the buffer.
        while (++i < size && data[i] != '"') {
          if (strBitset.hasEscape(data[i])) {
            i += 1;  // also skip the byte that follows the escape byte
          }
        }
        // `i` is now on the closing quote (or >= size); the loop increment
        // steps past it.
        break;
      case 't':
        tokenList->emplace_back(TokenType::kTrue, static_cast<int>(i));
        i += 3;  // skip "rue"
        break;
      case 'f':
        tokenList->emplace_back(TokenType::kFalse, static_cast<int>(i));
        i += 4;  // skip "alse"
        break;
      case 'n':
        tokenList->emplace_back(TokenType::kNull, static_cast<int>(i));
        i += 3;  // skip "ull"
        break;
      default:
        if (strBitset.hasNumeric(data[i])) {
          // Consume the rest of the number one byte at a time and stop ON its
          // last byte, so the loop increment lands on the byte right after the
          // number. Stopping one byte later would make the loop skip that byte
          // and lose e.g. the ']' in "[1]".
          while (i + 1 < size && strBitset.hasNumeric(data[i + 1])) {
            ++i;
          }
        }
        break;
    }
  }

  return tokenList;
}
}; // namespace tokenizer This build currently runs at 600 MB/s. I want to improve performance through parallel processing with EVE. |
Beta Was this translation helpful? Give feedback.
-
I don't think it's as obvious if you lack the context. I would also suggest checking whether simdjson is the solution you are looking for; it would be very difficult for you to compete with it. |
Beta Was this translation helpful? Give feedback.
-
I will have a look at the weekend; there is a lot there. So far, I'd say the best I can do for you would be to vectorise the splitting into substrings like this: https://github.com/jfalcou/eve/blob/main/examples/algorithms/writing_new/collect_indexes__complicated_real_example.cpp But I don't know if it would be faster than your switch. |
Beta Was this translation helpful? Give feedback.
-
After looking at this: a) I suspect this is fairly difficult. I'd suggest using the example I gave you: see if this is faster than what you have. |
Beta Was this translation helpful? Give feedback.
-
Beta Was this translation helpful? Give feedback.
All reactions