From c96b6987c42ed14644560e29ba19c33dc0ae0223 Mon Sep 17 00:00:00 2001 From: Itamar Date: Sat, 23 Jan 2021 16:47:20 +0200 Subject: [PATCH] LibCpp: Add the beginning of a C++ parser This parser will be used by the C++ language server to provide better auto-complete (& maybe also other things in the future). It is designed to be error tolerant, and keeps track of the position spans of the AST nodes, which should be useful later for incremental parsing. --- AK/ScopeLogger.h | 65 ++ Meta/CMake/all_the_debug_macros.cmake | 1 + Userland/Libraries/LibCpp/AST.cpp | 382 +++++++++ Userland/Libraries/LibCpp/AST.h | 585 +++++++++++++ Userland/Libraries/LibCpp/CMakeLists.txt | 2 + Userland/Libraries/LibCpp/Lexer.cpp | 22 +- Userland/Libraries/LibCpp/Lexer.h | 23 +- Userland/Libraries/LibCpp/Parser.cpp | 1000 ++++++++++++++++++++++ Userland/Libraries/LibCpp/Parser.h | 162 ++++ Userland/Utilities/CppParserTest.cpp | 64 ++ 11 files changed, 2298 insertions(+), 9 deletions(-) create mode 100644 AK/ScopeLogger.h create mode 100644 Userland/Libraries/LibCpp/AST.cpp create mode 100644 Userland/Libraries/LibCpp/AST.h create mode 100644 Userland/Libraries/LibCpp/Parser.cpp create mode 100644 Userland/Libraries/LibCpp/Parser.h create mode 100644 Userland/Utilities/CppParserTest.cpp diff --git a/AK/ScopeLogger.h b/AK/ScopeLogger.h new file mode 100644 index 00000000000..748fbd11b2d --- /dev/null +++ b/AK/ScopeLogger.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020, Denis Campredon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +#include + +#ifdef DEBUG_SPAM + +namespace AK { +/* Debug-only helper: prints "entering FUNCTION" when constructed and "leaving FUNCTION" when destroyed, indented by the current nesting depth. */ +class ScopeLogger { +public: + ScopeLogger(StringView&& fun) + : m_fun(fun) + { + StringBuilder sb; + + /* Indent by the depth before this scope was entered; the post-increment then records the new scope. */ + for (auto indent = m_depth++; indent > 0; indent--) + sb.append(' '); + dbgln("\033[1;{}m{}entering {}\033[0m", m_depth % 8 + 30, sb.to_string(), m_fun); + } + ~ScopeLogger() + { + StringBuilder sb; + + /* The pre-decrement restores the depth used on entry, so both messages line up; (m_depth + 1) reproduces the entry colour. */ + for (auto indent = --m_depth; indent > 0; indent--) + sb.append(' '); + dbgln("\033[1;{}m{}leaving {}\033[0m", (m_depth + 1) % 8 + 30, sb.to_string(), m_fun); + } + +private: + /* Nesting depth shared by every ScopeLogger instance. */ + static inline size_t m_depth = 0; + StringView m_fun; +}; +} + +using AK::ScopeLogger; +/* Token pasting with ## does not macro-expand its operands, so `tmp##__COUNTER__` always produced the single name `tmp__COUNTER__`. Pasting through a helper macro expands __COUNTER__ first, giving every SCOPE_LOGGER() a distinct variable. */ +# define SCOPE_LOGGER_CONCAT_IMPL(a, b) a##b +# define SCOPE_LOGGER_CONCAT(a, b) SCOPE_LOGGER_CONCAT_IMPL(a, b) +# define SCOPE_LOGGER() auto SCOPE_LOGGER_CONCAT(tmp, __COUNTER__) = ScopeLogger(__PRETTY_FUNCTION__); + +#else +# define SCOPE_LOGGER() +#endif diff --git a/Meta/CMake/all_the_debug_macros.cmake b/Meta/CMake/all_the_debug_macros.cmake index 2e54cb1fc76..2d93233773b 100644 --- a/Meta/CMake/all_the_debug_macros.cmake +++ b/Meta/CMake/all_the_debug_macros.cmake @@ -159,6 +159,7 @@ set(UPDATE_COALESCING_DEBUG ON) set(VOLATILE_PAGE_RANGES_DEBUG ON) set(WSMESSAGELOOP_DEBUG ON) set(GPT_DEBUG ON) +set(CPP_DEBUG ON) # False positive: DEBUG is a flag but it works differently. # set(DEBUG ON) diff --git a/Userland/Libraries/LibCpp/AST.cpp b/Userland/Libraries/LibCpp/AST.cpp new file mode 100644 index 00000000000..a2b6db8cfe2 --- /dev/null +++ b/Userland/Libraries/LibCpp/AST.cpp @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2021, Itamar S. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "AST.h" +#include "AK/LogStream.h" + +namespace Cpp { + +static void print_indent(int indent) +{ + for (int i = 0; i < indent * 2; ++i) + dbgprintf(" "); +} + +void ASTNode::dump(size_t indent) const +{ + print_indent(indent); + dbgprintf("%s[%lu:%lu->%lu:%lu]\n", class_name(), start().line, start().column, end().line, end().column); +} + +void TranslationUnit::dump(size_t indent) const +{ + ASTNode::dump(indent); + for (const auto& child : m_children) { + child.dump(indent + 1); + } +} + +void FunctionDeclaration::dump(size_t indent) const +{ + ASTNode::dump(indent); + m_return_type->dump(indent + 1); + if (!m_name.is_null()) { + print_indent(indent + 1); + dbgprintf("%s\n", m_name.to_string().characters()); + } + print_indent(indent + 1); + dbgprintf("(\n"); + for (const auto& arg : m_parameters) { + arg.dump(indent + 1); + } + print_indent(indent + 1); + dbgprintf(")\n"); + if (!m_definition.is_null()) { + m_definition->dump(indent + 1); + } +} + +NonnullRefPtrVector FunctionDeclaration::declarations() const +{ + NonnullRefPtrVector declarations; + for (auto& arg : m_parameters) { + declarations.append(arg); + } + return declarations; +} + +void Type::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent + 1); + dbgprintf("%s\n", m_name.to_string().characters()); +} + +void Parameter::dump(size_t indent) const +{ + ASTNode::dump(indent); + if (!m_name.is_null()) { + print_indent(indent); + dbgprintf("%s\n", m_name.to_string().characters()); + } + m_type->dump(indent + 1); + // print_indent(indent); + // dbgprintf("%s [%s]\n", m_name.is_null() ? 
"" : m_name.to_string().characters(), m_type->name().to_string().characters()); +} + +void FunctionDefinition::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent); + dbgprintf("{\n"); + for (const auto& statement : m_statements) { + statement.dump(indent + 1); + } + print_indent(indent); + dbgprintf("}\n"); +} + +NonnullRefPtrVector FunctionDefinition::declarations() const +{ + NonnullRefPtrVector declarations; + for (auto& statement : m_statements) { + declarations.append(statement.declarations()); + } + return declarations; +} + +void VariableDeclaration::dump(size_t indent) const +{ + ASTNode::dump(indent); + m_type->dump(indent + 1); + print_indent(indent + 1); + dbgprintf("%s\n", m_name.to_string().characters()); + if (m_initial_value) + m_initial_value->dump(indent + 1); +} + +void Identifier::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent); + dbgprintf("%s\n", m_name.to_string().characters()); +} + +void NumericLiteral::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent); + dbgprintf("%s\n", m_value.to_string().characters()); +} + +void BinaryExpression::dump(size_t indent) const +{ + ASTNode::dump(indent); + + const char* op_string = nullptr; + switch (m_op) { + case BinaryOp::Addition: + op_string = "+"; + break; + case BinaryOp::Subtraction: + op_string = "-"; + break; + case BinaryOp::Multiplication: + op_string = "*"; + break; + case BinaryOp::Division: + op_string = "/"; + break; + case BinaryOp::Modulo: + op_string = "%"; + break; + case BinaryOp::GreaterThan: + op_string = ">"; + break; + case BinaryOp::GreaterThanEquals: + op_string = ">="; + break; + case BinaryOp::LessThan: + op_string = "<"; + break; + case BinaryOp::LessThanEquals: + op_string = "<="; + break; + case BinaryOp::BitwiseAnd: + op_string = "&"; + break; + case BinaryOp::BitwiseOr: + op_string = "|"; + break; + case BinaryOp::BitwiseXor: + op_string = "^"; + break; + case BinaryOp::LeftShift: + op_string = 
"<<"; + break; + case BinaryOp::RightShift: + op_string = ">>"; + break; + } + + m_lhs->dump(indent + 1); + print_indent(indent + 1); + ASSERT(op_string); + dbgprintf("%s\n", op_string); + m_rhs->dump(indent + 1); +} + +void AssignmentExpression::dump(size_t indent) const +{ + ASTNode::dump(indent); + + const char* op_string = nullptr; + switch (m_op) { + case AssignmentOp::Assignment: + op_string = "="; + break; + case AssignmentOp::AdditionAssignment: + op_string = "+="; + break; + case AssignmentOp::SubtractionAssignment: + op_string = "-="; + break; + } + + m_lhs->dump(indent + 1); + print_indent(indent + 1); + ASSERT(op_string); + dbgprintf("%s\n", op_string); + m_rhs->dump(indent + 1); +} + +void FunctionCall::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent); + dbgprintf("%s\n", m_name.to_string().characters()); + for (const auto& arg : m_arguments) { + arg.dump(indent + 1); + } +} + +void StringLiteral::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent + 1); + dbgprintf("%s\n", m_value.to_string().characters()); +} + +void ReturnStatement::dump(size_t indent) const +{ + ASTNode::dump(indent); + m_value->dump(indent + 1); +} + +void EnumDeclaration::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent); + dbgprintf("%s\n", m_name.to_string().characters()); + for (auto& entry : m_entries) { + print_indent(indent + 1); + dbgprintf("%s\n", entry.to_string().characters()); + } +} + +void StructOrClassDeclaration::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent); + dbgprintf("%s\n", m_name.to_string().characters()); + for (auto& member : m_members) { + member.dump(indent + 1); + } +} + +void MemberDeclaration::dump(size_t indent) const +{ + ASTNode::dump(indent); + m_type->dump(indent + 1); + print_indent(indent + 1); + dbgprintf("%s\n", m_name.to_string().characters()); + if (m_initial_value) { + m_initial_value->dump(indent + 2); + } +} + +void 
UnaryExpression::dump(size_t indent) const +{ + ASTNode::dump(indent); + + const char* op_string = nullptr; + switch (m_op) { + case UnaryOp::BitwiseNot: + op_string = "~"; + break; + case UnaryOp::Not: + op_string = "!"; + break; + case UnaryOp::Plus: + op_string = "+"; + break; + case UnaryOp::Minus: + op_string = "-"; + break; + case UnaryOp::PlusPlus: + op_string = "++"; + break; + default: + op_string = ""; + } + + ASSERT(op_string); + print_indent(indent + 1); + dbgprintf("%s\n", op_string); + m_lhs->dump(indent + 1); +} + +void BooleanLiteral::dump(size_t indent) const +{ + ASTNode::dump(indent); + print_indent(indent + 1); + dbgprintf("%s\n", m_value ? "true" : "false"); +} + +void Pointer::dump(size_t indent) const +{ + ASTNode::dump(indent); + if (!m_pointee.is_null()) { + m_pointee->dump(indent + 1); + } +} + +void MemberExpression::dump(size_t indent) const +{ + ASTNode::dump(indent); + m_object->dump(indent + 1); + m_property->dump(indent + 1); +} + +void BlockStatement::dump(size_t indent) const +{ + ASTNode::dump(indent); + for (auto& statement : m_statements) { + statement.dump(indent + 1); + } +} + +void ForStatement::dump(size_t indent) const +{ + ASTNode::dump(indent); + if (m_init) + m_init->dump(indent + 1); + if (m_test) + m_test->dump(indent + 1); + if (m_update) + m_update->dump(indent + 1); + if (m_body) + m_body->dump(indent + 1); +} + +/* A statement that is itself a declaration yields a one-element list holding itself; any other statement yields an empty list. */ +NonnullRefPtrVector Statement::declarations() const +{ + if (is_declaration()) { + NonnullRefPtrVector vec; + const auto& decl = static_cast(*this); + vec.empend(const_cast(decl)); + return vec; + } + return {}; +} + +/* Returns the declarations of the init clause followed by those of the body. Either part may be absent (dump() above guards both), so each is checked before it is dereferenced. */ +NonnullRefPtrVector ForStatement::declarations() const +{ + NonnullRefPtrVector declarations; + if (m_init) + declarations.append(m_init->declarations()); + if (m_body) + declarations.append(m_body->declarations()); + return declarations; +} + +/* Concatenates the declarations of every statement in the block, in order. */ +NonnullRefPtrVector BlockStatement::declarations() const +{ + NonnullRefPtrVector declarations; + for (auto& statement : m_statements) { + declarations.append(statement.declarations()); + } + return declarations; +} 
+ +} diff --git a/Userland/Libraries/LibCpp/AST.h b/Userland/Libraries/LibCpp/AST.h new file mode 100644 index 00000000000..5f31b0f2e4e --- /dev/null +++ b/Userland/Libraries/LibCpp/AST.h @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2021, Itamar S. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +#include "Applications/Piano/Music.h" +#include +#include +#include +#include +#include +#include + +namespace Cpp { + +class ASTNode; +class TranslationUnit; +class Declaration; +class FunctionDefinition; +class Type; +class Parameter; +class Statement; + +class ASTNode : public RefCounted { +public: + virtual ~ASTNode() = default; + virtual const char* class_name() const = 0; + virtual void dump(size_t indent) const; + + ASTNode* parent() const { return m_parent; } + Position start() const + { + ASSERT(m_start.has_value()); + return m_start.value(); + } + Position end() const + { + ASSERT(m_end.has_value()); + return m_end.value(); + } + void set_end(const Position& end) { m_end = end; } + void set_parent(ASTNode& parent) { m_parent = &parent; } + + virtual NonnullRefPtrVector declarations() const { return {}; } + + virtual bool is_identifier() const { return false; } + virtual bool is_member_expression() const { return false; } + virtual bool is_variable_or_parameter_declaration() const { return false; } + +protected: + ASTNode(ASTNode* parent, Optional start, Optional end) + : m_parent(parent) + , m_start(start) + , m_end(end) + { + } + +private: + ASTNode* m_parent { nullptr }; + Optional m_start; + Optional m_end; +}; + +class TranslationUnit : public ASTNode { + +public: + virtual ~TranslationUnit() override = default; + const NonnullRefPtrVector& children() const { return m_children; } + virtual const char* class_name() const override { return "TranslationUnit"; } + virtual void dump(size_t indent) const override; + void append(NonnullRefPtr child) + { + m_children.append(move(child)); + } + virtual NonnullRefPtrVector declarations() const override { return m_children; } + +public: + TranslationUnit(ASTNode* parent, Optional start, Optional end) + : ASTNode(parent, start, end) + { + } + +private: + NonnullRefPtrVector m_children; +}; + +class Statement : public ASTNode { +public: + virtual ~Statement() override = default; + virtual 
const char* class_name() const override { return "Statement"; } + + virtual bool is_declaration() const { return false; } + virtual NonnullRefPtrVector declarations() const override; + +protected: + Statement(ASTNode* parent, Optional start, Optional end) + : ASTNode(parent, start, end) + { + } +}; + +class Declaration : public Statement { + +public: + virtual bool is_declaration() const override { return true; } + virtual bool is_variable_declaration() const { return false; } + virtual bool is_parameter() const { return false; } + virtual bool is_struct_or_class() const { return false; } + +protected: + Declaration(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } +}; + +class InvalidDeclaration : public Declaration { + +public: + virtual ~InvalidDeclaration() override = default; + virtual const char* class_name() const override { return "InvalidDeclaration"; } + InvalidDeclaration(ASTNode* parent, Optional start, Optional end) + : Declaration(parent, start, end) + { + } +}; + +class FunctionDeclaration : public Declaration { +public: + virtual ~FunctionDeclaration() override = default; + virtual const char* class_name() const override { return "FunctionDeclaration"; } + virtual void dump(size_t indent) const override; + const StringView& name() const { return m_name; } + RefPtr definition() { return m_definition; } + + FunctionDeclaration(ASTNode* parent, Optional start, Optional end) + : Declaration(parent, start, end) + { + } + + virtual NonnullRefPtrVector declarations() const override; + + StringView m_name; + RefPtr m_return_type; + NonnullRefPtrVector m_parameters; + RefPtr m_definition; +}; + +class VariableOrParameterDeclaration : public Declaration { +public: + virtual ~VariableOrParameterDeclaration() override = default; + virtual bool is_variable_or_parameter_declaration() const override { return true; } + + StringView m_name; + RefPtr m_type; + +protected: + VariableOrParameterDeclaration(ASTNode* parent, 
Optional start, Optional end) + : Declaration(parent, start, end) + { + } +}; + +class Parameter : public VariableOrParameterDeclaration { +public: + virtual ~Parameter() override = default; + virtual const char* class_name() const override { return "Parameter"; } + virtual void dump(size_t indent) const override; + + Parameter(ASTNode* parent, Optional start, Optional end, StringView name) + : VariableOrParameterDeclaration(parent, start, end) + { + m_name = name; + } + + virtual bool is_parameter() const override { return true; } +}; + +class Type : public ASTNode { +public: + virtual ~Type() override = default; + virtual const char* class_name() const override { return "Type"; } + const StringView& name() const { return m_name; } + virtual void dump(size_t indent) const override; + + Type(ASTNode* parent, Optional start, Optional end, StringView name) + : ASTNode(parent, start, end) + , m_name(name) + { + } + + StringView m_name; +}; + +class Pointer : public Type { +public: + virtual ~Pointer() override = default; + virtual const char* class_name() const override { return "Pointer"; } + virtual void dump(size_t indent) const override; + + Pointer(ASTNode* parent, Optional start, Optional end) + : Type(parent, start, end, {}) + { + } + + RefPtr m_pointee; +}; + +class FunctionDefinition : public ASTNode { +public: + virtual ~FunctionDefinition() override = default; + virtual const char* class_name() const override { return "FunctionDefinition"; } + NonnullRefPtrVector& statements() { return m_statements; } + virtual void dump(size_t indent) const override; + + FunctionDefinition(ASTNode* parent, Optional start, Optional end) + : ASTNode(parent, start, end) + { + } + + virtual NonnullRefPtrVector declarations() const override; + + NonnullRefPtrVector m_statements; +}; + +class InvalidStatement : public Statement { +public: + virtual ~InvalidStatement() override = default; + virtual const char* class_name() const override { return "InvalidStatement"; } + 
InvalidStatement(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } +}; + +class Expression : public Statement { +public: + virtual ~Expression() override = default; + virtual const char* class_name() const override { return "Expression"; } + +protected: + Expression(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } +}; + +class InvalidExpression : public Expression { +public: + virtual ~InvalidExpression() override = default; + virtual const char* class_name() const override { return "InvalidExpression"; } + InvalidExpression(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } +}; + +class VariableDeclaration : public VariableOrParameterDeclaration { +public: + virtual ~VariableDeclaration() override = default; + virtual const char* class_name() const override { return "VariableDeclaration"; } + virtual void dump(size_t indent) const override; + + VariableDeclaration(ASTNode* parent, Optional start, Optional end) + : VariableOrParameterDeclaration(parent, start, end) + { + } + + virtual bool is_variable_declaration() const override { return true; } + + RefPtr m_initial_value; +}; + +class Identifier : public Expression { +public: + virtual ~Identifier() override = default; + virtual const char* class_name() const override { return "Identifier"; } + virtual void dump(size_t indent) const override; + + Identifier(ASTNode* parent, Optional start, Optional end, StringView name) + : Expression(parent, start, end) + , m_name(name) + { + } + Identifier(ASTNode* parent, Optional start, Optional end) + : Identifier(parent, start, end, {}) + { + } + + virtual bool is_identifier() const override { return true; } + + StringView m_name; +}; + +class NumericLiteral : public Expression { +public: + virtual ~NumericLiteral() override = default; + /* class_name() is the label ASTNode::dump() prints for a node, so it must spell the class name correctly. */ + virtual const char* class_name() const override { return "NumericLiteral"; } + virtual void dump(size_t indent) const 
override; + + NumericLiteral(ASTNode* parent, Optional start, Optional end, StringView value) + : Expression(parent, start, end) + , m_value(value) + { + } + + StringView m_value; +}; + +class BooleanLiteral : public Expression { +public: + virtual ~BooleanLiteral() override = default; + virtual const char* class_name() const override { return "BooleanLiteral"; } + virtual void dump(size_t indent) const override; + + BooleanLiteral(ASTNode* parent, Optional start, Optional end, bool value) + : Expression(parent, start, end) + , m_value(value) + { + } + + bool m_value; +}; + +enum class BinaryOp { + Addition, + Subtraction, + Multiplication, + Division, + Modulo, + GreaterThan, + GreaterThanEquals, + LessThan, + LessThanEquals, + BitwiseAnd, + BitwiseOr, + BitwiseXor, + LeftShift, + RightShift, +}; + +class BinaryExpression : public Expression { +public: + BinaryExpression(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } + + virtual ~BinaryExpression() override = default; + virtual const char* class_name() const override { return "BinaryExpression"; } + virtual void dump(size_t indent) const override; + + BinaryOp m_op; + RefPtr m_lhs; + RefPtr m_rhs; +}; + +enum class AssignmentOp { + Assignment, + AdditionAssignment, + SubtractionAssignment, +}; + +class AssignmentExpression : public Expression { +public: + AssignmentExpression(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } + + virtual ~AssignmentExpression() override = default; + virtual const char* class_name() const override { return "AssignmentExpression"; } + virtual void dump(size_t indent) const override; + + AssignmentOp m_op; + RefPtr m_lhs; + RefPtr m_rhs; +}; + +class FunctionCall final : public Expression { +public: + FunctionCall(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } + + ~FunctionCall() override = default; + virtual const char* class_name() const override { return 
"FunctionCall"; } + virtual void dump(size_t indent) const override; + + StringView m_name; + NonnullRefPtrVector m_arguments; +}; + +class StringLiteral final : public Expression { +public: + StringLiteral(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } + + ~StringLiteral() override = default; + virtual const char* class_name() const override { return "StringLiteral"; } + virtual void dump(size_t indent) const override; + + StringView m_value; +}; + +class ReturnStatement : public Statement { +public: + virtual ~ReturnStatement() override = default; + virtual const char* class_name() const override { return "ReturnStatement"; } + + ReturnStatement(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } + virtual void dump(size_t indent) const override; + + RefPtr m_value; +}; + +class EnumDeclaration : public Declaration { +public: + virtual ~EnumDeclaration() override = default; + virtual const char* class_name() const override { return "EnumDeclaration"; } + virtual void dump(size_t indent) const override; + + EnumDeclaration(ASTNode* parent, Optional start, Optional end) + : Declaration(parent, start, end) + { + } + + StringView m_name; + Vector m_entries; +}; + +class MemberDeclaration : public Declaration { +public: + virtual ~MemberDeclaration() override = default; + virtual const char* class_name() const override { return "MemberDeclaration"; } + virtual void dump(size_t indent) const override; + + MemberDeclaration(ASTNode* parent, Optional start, Optional end) + : Declaration(parent, start, end) + { + } + + RefPtr m_type; + StringView m_name; + RefPtr m_initial_value; +}; + +class StructOrClassDeclaration : public Declaration { +public: + virtual ~StructOrClassDeclaration() override = default; + virtual const char* class_name() const override { return "StructOrClassDeclaration"; } + virtual void dump(size_t indent) const override; + virtual bool is_struct_or_class() const override 
{ return true; } + + enum class Type { + Struct, + Class + }; + + StructOrClassDeclaration(ASTNode* parent, Optional start, Optional end, StructOrClassDeclaration::Type type) + : Declaration(parent, start, end) + , m_type(type) + { + } + + StructOrClassDeclaration::Type m_type; + StringView m_name; + NonnullRefPtrVector m_members; +}; + +enum class UnaryOp { + Invalid, + BitwiseNot, + Not, + Plus, + Minus, + PlusPlus, +}; + +class UnaryExpression : public Expression { +public: + UnaryExpression(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } + + virtual ~UnaryExpression() override = default; + virtual const char* class_name() const override { return "UnaryExpression"; } + virtual void dump(size_t indent) const override; + + UnaryOp m_op; + RefPtr m_lhs; +}; + +class MemberExpression : public Expression { +public: + MemberExpression(ASTNode* parent, Optional start, Optional end) + : Expression(parent, start, end) + { + } + + virtual ~MemberExpression() override = default; + virtual const char* class_name() const override { return "MemberExpression"; } + virtual void dump(size_t indent) const override; + virtual bool is_member_expression() const override { return true; } + + RefPtr m_object; + RefPtr m_property; +}; + +class ForStatement : public Statement { +public: + ForStatement(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } + + virtual ~ForStatement() override = default; + virtual const char* class_name() const override { return "ForStatement"; } + virtual void dump(size_t indent) const override; + + virtual NonnullRefPtrVector declarations() const override; + + RefPtr m_init; + RefPtr m_test; + RefPtr m_update; + RefPtr m_body; +}; + +class BlockStatement final : public Statement { +public: + BlockStatement(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } + + virtual ~BlockStatement() override = default; + virtual const char* class_name() 
const override { return "BlockStatement"; } + virtual void dump(size_t indent) const override; + + virtual NonnullRefPtrVector declarations() const override; + + NonnullRefPtrVector m_statements; +}; + +class Comment final : public Statement { +public: + Comment(ASTNode* parent, Optional start, Optional end) + : Statement(parent, start, end) + { + } + + virtual ~Comment() override = default; + virtual const char* class_name() const override { return "Comment"; } +}; +} diff --git a/Userland/Libraries/LibCpp/CMakeLists.txt b/Userland/Libraries/LibCpp/CMakeLists.txt index f9e022bddd9..73eaf2bb4db 100644 --- a/Userland/Libraries/LibCpp/CMakeLists.txt +++ b/Userland/Libraries/LibCpp/CMakeLists.txt @@ -1,5 +1,7 @@ set(SOURCES + AST.cpp Lexer.cpp + Parser.cpp ) serenity_lib(LibCpp cpp) diff --git a/Userland/Libraries/LibCpp/Lexer.cpp b/Userland/Libraries/LibCpp/Lexer.cpp index 831822e5a56..5273f87dd41 100644 --- a/Userland/Libraries/LibCpp/Lexer.cpp +++ b/Userland/Libraries/LibCpp/Lexer.cpp @@ -581,12 +581,13 @@ Vector Lexer::lex() commit_token(Token::Type::IncludePath); begin_token(); } + } else { + while (peek() && peek() != '\n') + consume(); + + commit_token(Token::Type::PreprocessorStatement); } - while (peek() && peek() != '\n') - consume(); - - commit_token(Token::Type::PreprocessorStatement); continue; } if (ch == '/' && peek(1) == '/') { @@ -786,4 +787,17 @@ Vector Lexer::lex() return tokens; } +bool Position::operator<(const Position& other) const +{ + return line < other.line || (line == other.line && column < other.column); +} +bool Position::operator>(const Position& other) const +{ + return !(*this < other) && !(*this == other); +} +bool Position::operator==(const Position& other) const +{ + return line == other.line && column == other.column; +} + } diff --git a/Userland/Libraries/LibCpp/Lexer.h b/Userland/Libraries/LibCpp/Lexer.h index 351dee8aa54..3e7188f9aef 100644 --- a/Userland/Libraries/LibCpp/Lexer.h +++ b/Userland/Libraries/LibCpp/Lexer.h @@ -96,11 
+96,16 @@ namespace Cpp { __TOKEN(Float) \ __TOKEN(Keyword) \ __TOKEN(KnownType) \ - __TOKEN(Identifier) + __TOKEN(Identifier) \ + __TOKEN(EOF_TOKEN) struct Position { - size_t line; - size_t column; + size_t line { 0 }; + size_t column { 0 }; + + bool operator<(const Position&) const; + bool operator>(const Position&) const; + bool operator==(const Position&) const; }; struct Token { @@ -110,9 +115,9 @@ struct Token { #undef __TOKEN }; - const char* to_string() const + static const char* type_to_string(Type t) { - switch (m_type) { + switch (t) { #define __TOKEN(x) \ case Type::x: \ return #x; @@ -122,6 +127,14 @@ struct Token { ASSERT_NOT_REACHED(); } + const char* to_string() const + { + return type_to_string(m_type); + } + Position start() const { return m_start; } + Position end() const { return m_end; } + Type type() const { return m_type; } + Type m_type { Type::Unknown }; Position m_start; Position m_end; diff --git a/Userland/Libraries/LibCpp/Parser.cpp b/Userland/Libraries/LibCpp/Parser.cpp new file mode 100644 index 00000000000..a3447d1b617 --- /dev/null +++ b/Userland/Libraries/LibCpp/Parser.cpp @@ -0,0 +1,1000 @@ +/* + * Copyright (c) 2021, Itamar S. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// #define CPP_DEBUG + +#ifdef CPP_DEBUG +# define DEBUG_SPAM +#endif + +#include "Parser.h" +#include "AK/LogStream.h" +#include "AST.h" +#include +#include +#include + +namespace Cpp { + +Parser::Parser(const StringView& program) + : m_program(program) + , m_lines(m_program.split_view("\n", true)) +{ + Lexer lexer(m_program); + for (auto& token : lexer.lex()) { + if (token.m_type == Token::Type::Whitespace) + continue; + m_tokens.append(move(token)); + } +#ifdef CPP_DEBUG + dbgln("Program:"); + dbgln("{}", m_program); + dbgln("Tokens:"); + for (auto& token : m_tokens) { + dbgln("{}", token.to_string()); + } +#endif +} + +NonnullRefPtr Parser::parse() +{ + SCOPE_LOGGER(); + auto unit = create_root_ast_node(m_tokens.first().m_start, m_tokens.last().m_end); + while (!done()) { + if (match_comment()) { + consume(Token::Type::Comment); + continue; + } + + if (match_preprocessor()) { + consume_preprocessor(); + continue; + } + + auto declaration = match_declaration(); + if (declaration.has_value()) { + unit->append(parse_declaration(*unit, declaration.value())); + continue; + } + + error("unexpected token"); + consume(); + } + return unit; +} + +Optional Parser::match_declaration() +{ + switch (m_state.context) { + case Context::InTranslationUnit: + return match_declaration_in_translation_unit(); + case Context::InFunctionDefinition: + return match_declaration_in_function_definition(); + default: + error("unexpected context"); + 
return {};
+    }
+}
+
+// Parses the declaration kind previously reported by match_declaration(). An unhandled
+// kind is reported through error() and yields an empty node at the current position.
+NonnullRefPtr Parser::parse_declaration(ASTNode& parent, DeclarationType declaration_type)
+{
+    switch (declaration_type) {
+    case DeclarationType::Function:
+        return parse_function_declaration(parent);
+    case DeclarationType::Variable:
+        return parse_variable_declaration(parent);
+    case DeclarationType::Enum:
+        return parse_enum_declaration(parent);
+    case DeclarationType::Struct:
+        return parse_struct_or_class_declaration(parent, StructOrClassDeclaration::Type::Struct);
+    default:
+        error("unexpected declaration type");
+        return create_ast_node(parent, position(), position());
+    }
+}
+
+// Parses `KnownType Identifier ( parameters )` followed by either a `{ ... }` body or a `;`.
+// The node ends where the body ends, or at the position of the terminating semicolon.
+NonnullRefPtr Parser::parse_function_declaration(ASTNode& parent)
+{
+    auto func = create_ast_node(parent, position(), {});
+
+    auto return_type_token = consume(Token::Type::KnownType);
+    auto function_name = consume(Token::Type::Identifier);
+    consume(Token::Type::LeftParen);
+    auto parameters = parse_parameter_list(*func);
+    consume(Token::Type::RightParen);
+
+    RefPtr body;
+    Position func_end {};
+    if (peek(Token::Type::LeftCurly).has_value()) {
+        body = parse_function_definition(*func);
+        func_end = body->end();
+    } else {
+        func_end = position();
+        consume(Token::Type::Semicolon);
+    }
+
+    func->m_name = text_of_token(function_name);
+    func->m_return_type = create_ast_node(*func, return_type_token.m_start, return_type_token.m_end, text_of_token(return_type_token));
+    if (parameters.has_value())
+        func->m_parameters = move(parameters.value());
+    func->m_definition = move(body);
+    func->set_end(func_end);
+    return func;
+}
+
+// Parses a function body: `{`, statements until `}` or end of input, then `}`.
+// The end is recorded before the closing brace is consumed, and the brace is only
+// consumed when input remains, so a truncated body does not raise an extra error.
+NonnullRefPtr Parser::parse_function_definition(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    auto func = create_ast_node(parent, position(), {});
+    consume(Token::Type::LeftCurly);
+    while (!eof() && peek().m_type != Token::Type::RightCurly) {
+        func->statements().append(parse_statement(func));
+    }
+    func->set_end(position());
+    if (!eof())
+        consume(Token::Type::RightCurly);
+    return func;
+}
+
+NonnullRefPtr
Parser::parse_statement(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    // Most statements end with a semicolon; the guard consumes it on scope exit.
+    // Branches whose statement has no trailing semicolon disarm the guard first.
+    ArmedScopeGuard consume_semicolumn([this]() {
+        consume(Token::Type::Semicolon);
+    });
+
+    if (match_block_statement()) {
+        consume_semicolumn.disarm();
+        return parse_block_statement(parent);
+    }
+    if (match_comment()) {
+        consume_semicolumn.disarm();
+        return parse_comment(parent);
+    }
+    if (match_variable_declaration()) {
+        return parse_variable_declaration(parent);
+    }
+    if (match_expression()) {
+        return parse_expression(parent);
+    }
+    if (match_keyword("return")) {
+        return parse_return_statement(parent);
+    }
+    if (match_keyword("for")) {
+        consume_semicolumn.disarm();
+        return parse_for_statement(parent);
+    } else {
+        // Unknown statement: report it and skip one token so the caller makes progress.
+        error("unexpected statement type");
+        consume_semicolumn.disarm();
+        consume();
+        return create_ast_node(parent, position(), position());
+    }
+}
+
+// Wraps a single Comment token in a node; the node ends at the position of the next token.
+NonnullRefPtr Parser::parse_comment(ASTNode& parent)
+{
+    auto comment = create_ast_node(parent, position(), {});
+    consume(Token::Type::Comment);
+    comment->set_end(position());
+    return comment;
+}
+
+// A block statement starts with `{`.
+bool Parser::match_block_statement()
+{
+    return peek().type() == Token::Type::LeftCurly;
+}
+
+// Parses `{ statement* }`.
+NonnullRefPtr Parser::parse_block_statement(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    auto block_statement = create_ast_node(parent, position(), {});
+    consume(Token::Type::LeftCurly);
+    // Stop at end of input as well: consume() no longer advances once the tokens are
+    // exhausted, so an unterminated block would otherwise loop forever.
+    while (peek().type() != Token::Type::RightCurly && !eof()) {
+        block_statement->m_statements.append(parse_statement(*block_statement));
+    }
+    consume(Token::Type::RightCurly);
+    block_statement->set_end(position());
+    return block_statement;
+}
+
+// Speculatively checks for `Type Identifier [= expression]`. The parser state is saved
+// on entry and restored on every exit path, so no tokens stay consumed.
+bool Parser::match_variable_declaration()
+{
+    save_state();
+    ScopeGuard state_guard = [this] { load_state(); };
+
+    if (!peek(Token::Type::KnownType).has_value() && !peek(Token::Type::Identifier).has_value())
+        return false;
+    consume();
+
+    if (!peek(Token::Type::Identifier).has_value())
+        return false;
+    consume();
+
+    if (match(Token::Type::Equals)) {
+        consume(Token::Type::Equals);
+        if (!match_expression()) {
+
error("initial value of variable is not an expression");
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Parses `Type Identifier [= expression]`. The type token must be a KnownType or an
+// Identifier; otherwise an error is recorded and a node covering only that token is returned.
+// The trailing semicolon is not consumed here.
+NonnullRefPtr Parser::parse_variable_declaration(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    auto var = create_ast_node(parent, position(), {});
+    auto type_token = consume();
+    if (type_token.type() != Token::Type::KnownType && type_token.type() != Token::Type::Identifier) {
+        error("unexpected token for variable type");
+        var->set_end(type_token.end());
+        return var;
+    }
+    auto identifier_token = consume(Token::Type::Identifier);
+    RefPtr initial_value;
+
+    if (match(Token::Type::Equals)) {
+        consume(Token::Type::Equals);
+        initial_value = parse_expression(var);
+    }
+
+    var->set_end(position());
+    var->m_type = create_ast_node(var, type_token.m_start, type_token.m_end, text_of_token(type_token));
+    var->m_name = text_of_token(identifier_token);
+    var->m_initial_value = move(initial_value);
+
+    return var;
+}
+
+// Parses a primary expression followed by any number of secondary (operator) expressions.
+// Each secondary expression takes the expression parsed so far as its left-hand side.
+NonnullRefPtr Parser::parse_expression(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    auto expression = parse_primary_expression(parent);
+    // TODO: remove eof() logic, should still work without it
+    if (eof() || match(Token::Type::Semicolon)) {
+        return expression;
+    }
+
+    NonnullRefPtrVector secondary_expressions;
+
+    while (match_secondary_expression()) {
+        // FIXME: Handle operator precedence
+        expression = parse_secondary_expression(parent, expression);
+        secondary_expressions.append(expression);
+    }
+
+    // Re-parent every secondary expression except the last to the one parsed after it.
+    // The size() != 0 check keeps `size() - 1` from wrapping around on an empty vector.
+    for (size_t i = 0; secondary_expressions.size() != 0 && i < secondary_expressions.size() - 1; ++i) {
+        secondary_expressions[i].set_parent(secondary_expressions[i + 1]);
+    }
+
+    return expression;
+}
+
+// True when the next token is an operator that can continue an expression.
+bool Parser::match_secondary_expression()
+{
+    auto type = peek().type();
+    return type == Token::Type::Plus
+        || type == Token::Type::PlusEquals
+        || type == Token::Type::Minus
+        || type == Token::Type::MinusEquals
+        || type == Token::Type::Asterisk
+        || type == Token::Type::AsteriskEquals
+        || type == Token::Type::Percent
+        || type ==
Token::Type::PercentEquals
+        || type == Token::Type::Equals
+        || type == Token::Type::Greater
+        // Was a second, redundant test for Greater; `>=` is the operator missing from this list.
+        || type == Token::Type::GreaterEquals
+        || type == Token::Type::Less
+        || type == Token::Type::LessEquals
+        || type == Token::Type::Dot
+        || type == Token::Type::PlusPlus
+        || type == Token::Type::MinusMinus
+        || type == Token::Type::And
+        || type == Token::Type::AndEquals
+        || type == Token::Type::Pipe
+        || type == Token::Type::PipeEquals
+        || type == Token::Type::Caret
+        || type == Token::Type::CaretEquals
+        || type == Token::Type::LessLess
+        || type == Token::Type::LessLessEquals
+        || type == Token::Type::GreaterGreater
+        || type == Token::Type::GreaterGreaterEquals
+        || type == Token::Type::AndAnd
+        || type == Token::Type::PipePipe;
+}
+
+// Parses the first operand of an expression: a unary expression, a literal, a function
+// call or a plain identifier. Any other token is reported through error() and consumed,
+// so the caller always makes progress. At end of input an empty node is returned.
+NonnullRefPtr Parser::parse_primary_expression(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    // TODO: remove eof() logic, should still work without it
+    if (eof()) {
+        auto node = create_ast_node(parent, position(), position());
+        return node;
+    }
+
+    if (match_unary_expression())
+        return parse_unary_expression(parent);
+
+    if (match_literal()) {
+        return parse_literal(parent);
+    }
+    switch (peek().type()) {
+    case Token::Type::Identifier: {
+        if (match_function_call())
+            return parse_function_call(parent);
+        auto token = consume();
+        return create_ast_node(parent, token.m_start, token.m_end, text_of_token(token));
+    }
+    default: {
+        error("could not parse primary expression");
+        auto token = consume();
+        return create_ast_node(parent, token.m_start, token.m_end);
+    }
+    }
+}
+
+// True for integer and double-quoted string tokens, and for the keywords accepted by
+// match_boolean_literal().
+bool Parser::match_literal()
+{
+    switch (peek().type()) {
+    case Token::Type::Integer:
+        return true;
+    case Token::Type::DoubleQuotedString:
+        return true;
+    case Token::Type::Keyword: {
+        return match_boolean_literal();
+    }
+    default:
+        return false;
+    }
+}
+
+// True when the next token is a prefix operator.
+bool Parser::match_unary_expression()
+{
+    auto type = peek().type();
+    return type == Token::Type::PlusPlus
+        || type == Token::Type::MinusMinus
+        || type == Token::Type::ExclamationMark
+        || type ==
Token::Type::Tilde
+        || type == Token::Type::Plus
+        || type == Token::Type::Minus;
+}
+
+// Parses a prefix operator followed by its operand expression.
+// NOTE(review): match_unary_expression() also accepts MinusMinus, but there is no case
+// for it below, so `--x` is recorded with UnaryOp::Invalid.
+NonnullRefPtr Parser::parse_unary_expression(ASTNode& parent)
+{
+    auto unary_exp = create_ast_node(parent, position(), {});
+    auto op_token = consume();
+    UnaryOp op { UnaryOp::Invalid };
+    switch (op_token.type()) {
+    case Token::Type::Minus:
+        op = UnaryOp::Minus;
+        break;
+    case Token::Type::Plus:
+        op = UnaryOp::Plus;
+        break;
+    case Token::Type::ExclamationMark:
+        op = UnaryOp::Not;
+        break;
+    case Token::Type::Tilde:
+        op = UnaryOp::BitwiseNot;
+        break;
+    case Token::Type::PlusPlus:
+        op = UnaryOp::PlusPlus;
+        break;
+    default:
+        break;
+    }
+    unary_exp->m_op = op;
+    auto lhs = parse_expression(*unary_exp);
+    unary_exp->m_lhs = lhs;
+    unary_exp->set_end(lhs->end());
+    return unary_exp;
+}
+
+// Parses an integer, string or boolean literal. A keyword that is not a boolean literal
+// falls through to the error path, which records an error and consumes the token.
+NonnullRefPtr Parser::parse_literal(ASTNode& parent)
+{
+    switch (peek().type()) {
+    case Token::Type::Integer: {
+        auto token = consume();
+        return create_ast_node(parent, token.m_start, token.m_end, text_of_token(token));
+    }
+    case Token::Type::DoubleQuotedString: {
+        return parse_string_literal(parent);
+    }
+    case Token::Type::Keyword: {
+        if (match_boolean_literal())
+            return parse_boolean_literal(parent);
+        [[fallthrough]];
+    }
+    default: {
+        error("could not parse literal");
+        auto token = consume();
+        return create_ast_node(parent, token.m_start, token.m_end);
+    }
+    }
+}
+
+// Parses the operator at the current position with `lhs` as its left operand.
+// Handled here: `+`, `<`, `=` and `.`; every other operator reaches the default branch.
+NonnullRefPtr Parser::parse_secondary_expression(ASTNode& parent, NonnullRefPtr lhs)
+{
+    SCOPE_LOGGER();
+    switch (peek().m_type) {
+    case Token::Type::Plus:
+        return parse_binary_expression(parent, lhs, BinaryOp::Addition);
+    case Token::Type::Less:
+        return parse_binary_expression(parent, lhs, BinaryOp::LessThan);
+    case Token::Type::Equals:
+        return parse_assignment_expression(parent, lhs, AssignmentOp::Assignment);
+    case Token::Type::Dot: {
+        consume();
+        auto exp = create_ast_node(parent, lhs->start(), {});
+        lhs->set_parent(*exp);
+        exp->m_object = move(lhs);
+        auto property_token =
consume(Token::Type::Identifier);
+        exp->m_property = create_ast_node(*exp, property_token.start(), property_token.end(), text_of_token(property_token));
+        exp->set_end(property_token.end());
+        return exp;
+    }
+    default: {
+        // Unhandled operator: report it and consume the token so the caller makes progress.
+        error(String::formatted("unexpected operator for expression. operator: {}", peek().to_string()));
+        auto token = consume();
+        return create_ast_node(parent, token.start(), token.end());
+    }
+    }
+}
+
+// Consumes the operator token and parses the right-hand side. The new node starts where
+// `lhs` starts, ends where the right-hand side ends, and becomes the parent of `lhs`.
+NonnullRefPtr Parser::parse_binary_expression(ASTNode& parent, NonnullRefPtr lhs, BinaryOp op)
+{
+    consume(); // Operator
+    auto exp = create_ast_node(parent, lhs->start(), {});
+    lhs->set_parent(*exp);
+    exp->m_op = op;
+    exp->m_lhs = move(lhs);
+    auto rhs = parse_expression(exp);
+    exp->set_end(rhs->end());
+    exp->m_rhs = move(rhs);
+    return exp;
+}
+
+// Same shape as parse_binary_expression(), for assignment operators.
+NonnullRefPtr Parser::parse_assignment_expression(ASTNode& parent, NonnullRefPtr lhs, AssignmentOp op)
+{
+    consume(); // Operator
+    auto exp = create_ast_node(parent, lhs->start(), {});
+    lhs->set_parent(*exp);
+    exp->m_op = op;
+    exp->m_lhs = move(lhs);
+    auto rhs = parse_expression(exp);
+    exp->set_end(rhs->end());
+    exp->m_rhs = move(rhs);
+    return exp;
+}
+
+// Reports which top-level declaration starts at the current token, if any.
+// Functions are tried first, then enums, then structs.
+Optional Parser::match_declaration_in_translation_unit()
+{
+    if (match_function_declaration())
+        return DeclarationType::Function;
+    if (match_enum_declaration())
+        return DeclarationType::Enum;
+    if (match_struct_declaration())
+        return DeclarationType::Struct;
+    return {};
+}
+
+// True when the next token is the keyword `enum`.
+bool Parser::match_enum_declaration()
+{
+    return peek().type() == Token::Type::Keyword && text_of_token(peek()) == "enum";
+}
+
+// True when the next token is the keyword `struct`.
+bool Parser::match_struct_declaration()
+{
+    return peek().type() == Token::Type::Keyword && text_of_token(peek()) == "struct";
+}
+
+// Speculatively checks for `KnownType Identifier ( ... )` followed by `;` or `{`.
+// The parser state is saved on entry and restored on every exit path.
+bool Parser::match_function_declaration()
+{
+    save_state();
+    ScopeGuard state_guard = [this] { load_state(); };
+
+    if (!peek(Token::Type::KnownType).has_value())
+        return false;
+    consume();
+
+    if (!peek(Token::Type::Identifier).has_value())
+        return false;
+    consume();
+
+    if
(!peek(Token::Type::LeftParen).has_value())
+        return false;
+    consume();
+
+    // Skip everything up to and including the closing parenthesis (or until input runs out).
+    while (consume().m_type != Token::Type::RightParen && !eof()) { };
+
+    if (peek(Token::Type::Semicolon).has_value() || peek(Token::Type::LeftCurly).has_value())
+        return true;
+
+    return false;
+}
+
+// Parses parameters up to (not including) the closing `)`. Each parameter is a type
+// optionally followed by an identifier; an unnamed parameter gets an empty name and ends
+// where its type ends. Commas between parameters are consumed when present.
+Optional> Parser::parse_parameter_list(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    NonnullRefPtrVector parameters;
+    while (peek().m_type != Token::Type::RightParen && !eof()) {
+        auto type = parse_type(parent);
+
+        auto name_identifier = peek(Token::Type::Identifier);
+        if (name_identifier.has_value())
+            consume(Token::Type::Identifier);
+
+        StringView name;
+        if (name_identifier.has_value())
+            name = text_of_token(name_identifier.value());
+
+        auto param = create_ast_node(parent, type->start(), name_identifier.has_value() ? name_identifier.value().m_end : type->end(), name);
+
+        param->m_type = move(type);
+        parameters.append(move(param));
+        if (peek(Token::Type::Comma).has_value())
+            consume(Token::Type::Comma);
+    }
+    return parameters;
+}
+
+// True when the next token is a Comment.
+bool Parser::match_comment()
+{
+    return match(Token::Type::Comment);
+}
+
+// True when the next token is Whitespace.
+bool Parser::match_whitespace()
+{
+    return match(Token::Type::Whitespace);
+}
+
+// True when the next token is a PreprocessorStatement or an IncludeStatement.
+bool Parser::match_preprocessor()
+{
+    return match(Token::Type::PreprocessorStatement) || match(Token::Type::IncludeStatement);
+}
+
+// Consumes one preprocessor construct: a PreprocessorStatement token on its own, or an
+// IncludeStatement token followed by its IncludePath token.
+void Parser::consume_preprocessor()
+{
+    SCOPE_LOGGER();
+    switch (peek().type()) {
+    case Token::Type::PreprocessorStatement:
+        consume();
+        break;
+    case Token::Type::IncludeStatement:
+        consume();
+        consume(Token::Type::IncludePath);
+        break;
+    default:
+        error("unexpected token while parsing preprocessor statement");
+        consume();
+    }
+}
+
+// Consumes the next token, expecting it to be Whitespace.
+Optional Parser::consume_whitespace()
+{
+    SCOPE_LOGGER();
+    return consume(Token::Type::Whitespace);
+}
+
+// Consumes the next token and records an error if its type differs from `type`.
+// The token is returned (and stays consumed) either way.
+Token Parser::consume(Token::Type type)
+{
+    auto token = consume();
+    if (token.type() != type)
+        error(String::formatted("expected {} at {}:{}, found: {}", Token::type_to_string(type), token.start().line,
token.start().column, Token::type_to_string(token.type())));
+    return token;
+}
+
+// True when the next token has the given type; nothing is consumed.
+bool Parser::match(Token::Type type)
+{
+    return peek().m_type == type;
+}
+
+// Returns the next token and advances. At end of input an error is recorded and a
+// synthetic EOF_TOKEN is returned without advancing.
+Token Parser::consume()
+{
+    if (eof()) {
+        error("C++ Parser: out of tokens");
+        return { Token::Type::EOF_TOKEN, position(), position() };
+    }
+    return m_tokens[m_state.token_index++];
+}
+
+// Returns the next token without advancing, or a synthetic EOF_TOKEN at end of input.
+Token Parser::peek() const
+{
+    if (eof()) {
+        return { Token::Type::EOF_TOKEN, position(), position() };
+    }
+    return m_tokens[m_state.token_index];
+}
+
+// Returns the next token only if it has the given type; otherwise an empty Optional.
+Optional Parser::peek(Token::Type type) const
+{
+    auto token = peek();
+    if (token.m_type == type)
+        return token;
+    return {};
+}
+
+// Pushes the current parser state so a speculative match can be undone with load_state().
+void Parser::save_state()
+{
+    m_saved_states.append(m_state);
+}
+
+// Restores the most recently saved parser state and removes it from the saved list.
+void Parser::load_state()
+{
+    m_state = m_saved_states.take_last();
+}
+
+// Not implemented: reaching this function is treated as a programming error.
+Optional Parser::match_declaration_in_function_definition()
+{
+    ASSERT_NOT_REACHED();
+}
+
+// True once the token index equals the number of tokens.
+bool Parser::done()
+{
+    return m_state.token_index == m_tokens.size();
+}
+
+// Returns the source text of a token. The token must start and end on the same line;
+// the end column is treated as inclusive (hence the + 1).
+StringView Parser::text_of_token(const Cpp::Token& token)
+{
+    ASSERT(token.m_start.line == token.m_end.line);
+    ASSERT(token.m_start.column <= token.m_end.column);
+    return m_lines[token.m_start.line].substring_view(token.m_start.column, token.m_end.column - token.m_start.column + 1);
+}
+
+// Returns the source text covered by a node.
+// NOTE(review): the single-line branch includes the end column (+ 1) while the multi-line
+// branch stops just before it; confirm which of the two is intended.
+StringView Parser::text_of_node(const ASTNode& node) const
+{
+    if (node.start().line == node.end().line) {
+        ASSERT(node.start().column <= node.end().column);
+        return m_lines[node.start().line].substring_view(node.start().column, node.end().column - node.start().column + 1);
+    }
+
+    // Converts a line/column position into an offset into m_program, counting one extra
+    // character per preceding line for its newline.
+    auto index_of_position([this](auto position) {
+        size_t start_index = 0;
+        for (size_t line = 0; line < position.line; ++line) {
+            start_index += m_lines[line].length() + 1;
+        }
+        start_index += position.column;
+        return start_index;
+    });
+    auto start_index = index_of_position(node.start());
+    auto end_index = index_of_position(node.end());
+    ASSERT(end_index >= start_index);
+    return m_program.substring_view(start_index, end_index - start_index);
+}
+
+void Parser::error(StringView message) +{ + SCOPE_LOGGER(); + if (message.is_null() || message.is_empty()) + message = ""; + String formatted_message; + if (m_state.token_index >= m_tokens.size()) { + formatted_message = String::formatted("C++ Parsed error on EOF.{}", message); + } else { + formatted_message = String::formatted("C++ Parser error: {}. token: {} ({}:{})", + message, + m_state.token_index < m_tokens.size() ? text_of_token(m_tokens[m_state.token_index]) : "EOF", + m_tokens[m_state.token_index].m_start.line, + m_tokens[m_state.token_index].m_start.column); + } + m_errors.append(formatted_message); + dbgln("{}", formatted_message); +} + +bool Parser::match_expression() +{ + auto token_type = peek().m_type; + return token_type == Token::Type::Integer + || token_type == Token::Type::Float + || token_type == Token::Type::Identifier + || match_unary_expression(); +} + +bool Parser::eof() const +{ + return m_state.token_index >= m_tokens.size(); +} + +Position Parser::position() const +{ + if (eof()) + return m_tokens.last().m_end; + return peek().m_start; +} + +RefPtr Parser::eof_node() const +{ + ASSERT(m_tokens.size()); + return node_at(m_tokens.last().m_end); +} + +RefPtr Parser::node_at(Position pos) const +{ + ASSERT(!m_tokens.is_empty()); + RefPtr match_node; + for (auto& node : m_nodes) { + if (node.start() > pos || node.end() < pos) + continue; + if (!match_node) + match_node = node; + else if (node_span_size(node) < node_span_size(*match_node)) + match_node = node; + } + return match_node; +} + +Optional Parser::token_at(Position pos) const +{ + for (auto& token : m_tokens) { + if (token.start() > pos || token.end() < pos) + continue; + return token; + } + return {}; +} + +size_t Parser::node_span_size(const ASTNode& node) const +{ + if (node.start().line == node.end().line) + return node.end().column - node.start().column; + + size_t span_size = m_lines[node.start().line].length() - node.start().column; + for (size_t line = node.start().line + 1; 
line < node.end().line; ++line) { + span_size += m_lines[line].length(); + } + return span_size + m_lines[node.end().line].length() - node.end().column; +} + +void Parser::print_tokens() const +{ + for (auto& token : m_tokens) { + dbgln("{}", token.to_string()); + } +} + +bool Parser::match_function_call() +{ + save_state(); + ScopeGuard state_guard = [this] { load_state(); }; + if (!match(Token::Type::Identifier)) + return false; + consume(); + return match(Token::Type::LeftParen); +} + +NonnullRefPtr Parser::parse_function_call(ASTNode& parent) +{ + SCOPE_LOGGER(); + auto call = create_ast_node(parent, position(), {}); + auto name_identifier = consume(Token::Type::Identifier); + call->m_name = text_of_token(name_identifier); + + NonnullRefPtrVector args; + consume(Token::Type::LeftParen); + while (peek().type() != Token::Type::RightParen && !eof()) { + args.append(parse_expression(*call)); + if (peek().type() == Token::Type::Comma) + consume(Token::Type::Comma); + } + consume(Token::Type::RightParen); + call->m_arguments = move(args); + call->set_end(position()); + return call; +} + +NonnullRefPtr Parser::parse_string_literal(ASTNode& parent) +{ + SCOPE_LOGGER(); + Optional start_token_index; + Optional end_token_index; + while (!eof()) { + auto token = peek(); + if (token.type() != Token::Type::DoubleQuotedString && token.type() != Token::Type::EscapeSequence) { + ASSERT(start_token_index.has_value()); + // TODO: don't consume + end_token_index = m_state.token_index - 1; + break; + } + if (!start_token_index.has_value()) + start_token_index = m_state.token_index; + consume(); + } + ASSERT(start_token_index.has_value()); + ASSERT(end_token_index.has_value()); + Token start_token = m_tokens[start_token_index.value()]; + Token end_token = m_tokens[end_token_index.value()]; + ASSERT(start_token.start().line == end_token.start().line); + + auto text = m_lines[start_token.start().line].substring_view(start_token.start().column, end_token.end().column - 
start_token.start().column + 1);
+    auto string_literal = create_ast_node(parent, start_token.start(), end_token.end());
+    string_literal->m_value = text;
+    return string_literal;
+}
+
+// Parses a keyword token followed by an expression; the node ends where the expression ends.
+// The expression is parsed unconditionally, so a bare `return;` is not handled specially.
+NonnullRefPtr Parser::parse_return_statement(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    auto return_statement = create_ast_node(parent, position(), {});
+    consume(Token::Type::Keyword);
+    auto expression = parse_expression(*return_statement);
+    return_statement->m_value = expression;
+    return_statement->set_end(expression->end());
+    return return_statement;
+}
+
+// Parses `enum Identifier { Identifier [, Identifier]* } ;`. Entry parsing stops at `}`,
+// at end of input, or as soon as an entry is not followed by a comma.
+NonnullRefPtr Parser::parse_enum_declaration(ASTNode& parent)
+{
+    SCOPE_LOGGER();
+    auto enum_decl = create_ast_node(parent, position(), {});
+    consume_keyword("enum");
+    auto name_token = consume(Token::Type::Identifier);
+    enum_decl->m_name = text_of_token(name_token);
+    consume(Token::Type::LeftCurly);
+    while (peek().type() != Token::Type::RightCurly && !eof()) {
+        enum_decl->m_entries.append(text_of_token(consume(Token::Type::Identifier)));
+        if (peek().type() != Token::Type::Comma) {
+            break;
+        }
+        consume(Token::Type::Comma);
+    }
+    consume(Token::Type::RightCurly);
+    consume(Token::Type::Semicolon);
+    enum_decl->set_end(position());
+    return enum_decl;
+}
+
+// Consumes the next token and records an error unless it is the given keyword.
+// The consumed token is returned in every case.
+Token Parser::consume_keyword(const String& keyword)
+{
+    auto token = consume();
+    if (token.type() != Token::Type::Keyword) {
+        error(String::formatted("unexpected token: {}, expected Keyword", token.to_string()));
+        return token;
+    }
+    if (text_of_token(token) != keyword) {
+        error(String::formatted("unexpected keyword: {}, expected {}", text_of_token(token), keyword));
+        return token;
+    }
+    return token;
+}
+
+// True when the next token is a Keyword whose text equals `keyword`; nothing is consumed.
+bool Parser::match_keyword(const String& keyword)
+{
+    auto token = peek();
+    if (token.type() != Token::Type::Keyword) {
+        return false;
+    }
+    if (text_of_token(token) != keyword) {
+        return false;
+    }
+    return true;
+}
+
+// Parses `struct|class Identifier { member* } ;`, expecting the keyword selected by `type`.
+NonnullRefPtr Parser::parse_struct_or_class_declaration(ASTNode& parent, StructOrClassDeclaration::Type type)
+{
+    SCOPE_LOGGER();
+
auto decl = create_ast_node(parent, position(), {}, type); + switch (type) { + case StructOrClassDeclaration::Type::Struct: + consume_keyword("struct"); + break; + case StructOrClassDeclaration::Type::Class: + consume_keyword("class"); + break; + } + auto name_token = consume(Token::Type::Identifier); + decl->m_name = text_of_token(name_token); + + consume(Token::Type::LeftCurly); + + while (peek().type() != Token::Type::RightCurly && !eof()) { + decl->m_members.append(parse_member_declaration(*decl)); + } + + consume(Token::Type::RightCurly); + consume(Token::Type::Semicolon); + decl->set_end(position()); + return decl; +} + +NonnullRefPtr Parser::parse_member_declaration(ASTNode& parent) +{ + SCOPE_LOGGER(); + auto member_decl = create_ast_node(parent, position(), {}); + auto type_token = consume(); + auto identifier_token = consume(Token::Type::Identifier); + RefPtr initial_value; + + if (match(Token::Type::LeftCurly)) { + consume(Token::Type::LeftCurly); + initial_value = parse_expression(*member_decl); + consume(Token::Type::RightCurly); + } + + member_decl->m_type = create_ast_node(*member_decl, type_token.m_start, type_token.m_end, text_of_token(type_token)); + member_decl->m_name = text_of_token(identifier_token); + member_decl->m_initial_value = move(initial_value); + consume(Token::Type::Semicolon); + member_decl->set_end(position()); + + return member_decl; +} + +NonnullRefPtr Parser::parse_boolean_literal(ASTNode& parent) +{ + SCOPE_LOGGER(); + auto token = consume(Token::Type::Keyword); + auto text = text_of_token(token); + // text == "true" || text == "false"; + bool value = (text == "true"); + return create_ast_node(parent, token.start(), token.end(), value); +} + +bool Parser::match_boolean_literal() +{ + auto token = peek(); + if (token.type() != Token::Type::Keyword) + return false; + auto text = text_of_token(token); + return text == "true" || text == "false"; +} + +NonnullRefPtr Parser::parse_type(ASTNode& parent) +{ + SCOPE_LOGGER(); + auto 
token = consume(); + auto type = create_ast_node(parent, token.start(), token.end(), text_of_token(token)); + if (token.type() != Token::Type::KnownType && token.type() != Token::Type::Identifier) { + error(String::formatted("unexpected token for type: {}", token.to_string())); + return type; + } + while (peek().type() == Token::Type::Asterisk) { + auto asterisk = consume(); + auto ptr = create_ast_node(type, asterisk.start(), asterisk.end()); + ptr->m_pointee = type; + type = ptr; + } + return type; +} + +NonnullRefPtr Parser::parse_for_statement(ASTNode& parent) +{ + SCOPE_LOGGER(); + auto for_statement = create_ast_node(parent, position(), {}); + consume(Token::Type::Keyword); + consume(Token::Type::LeftParen); + for_statement->m_init = parse_variable_declaration(*for_statement); + consume(Token::Type::Semicolon); + for_statement->m_test = parse_expression(*for_statement); + consume(Token::Type::Semicolon); + for_statement->m_update = parse_expression(*for_statement); + consume(Token::Type::RightParen); + for_statement->m_body = parse_statement(*for_statement); + for_statement->set_end(for_statement->m_body->end()); + return for_statement; +} + +} diff --git a/Userland/Libraries/LibCpp/Parser.h b/Userland/Libraries/LibCpp/Parser.h new file mode 100644 index 00000000000..013d1ff6972 --- /dev/null +++ b/Userland/Libraries/LibCpp/Parser.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, Itamar S. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "AK/NonnullRefPtr.h" +#include "AST.h" +#include + +namespace Cpp { + +/* Parser for C++ source text. It is constructed from a StringView of the program, parse() runs the parse, and errors() returns a copy of the messages collected in m_errors. The commit that adds this file describes the parser as error tolerant and as tracking the position spans of AST nodes. */ class Parser final { +public: + explicit Parser(const StringView&); + ~Parser() = default; + + NonnullRefPtr parse(); + bool eof() const; + + RefPtr eof_node() const; + RefPtr node_at(Position) const; + Optional token_at(Position) const; + RefPtr root_node() const { return m_root_node; } + StringView text_of_node(const ASTNode&) const; + void print_tokens() const; + Vector errors() const { return m_errors; } + +private: + /* Kind of declaration passed to parse_declaration(). */ enum class DeclarationType { + Function, + Variable, + Enum, + Struct, + }; + + bool done(); + + /* match_*() helpers: predicates about what comes next in the token stream. NOTE(review): presumably none of them leave tokens consumed; only some definitions are visible next to this header, so confirm for the rest. */ Optional match_declaration(); + Optional match_declaration_in_translation_unit(); + Optional match_declaration_in_function_definition(); + bool match_function_declaration(); + bool match_comment(); + bool match_preprocessor(); + bool match_whitespace(); + bool match_variable_declaration(); + bool match_expression(); + bool match_function_call(); + bool match_secondary_expression(); + bool match_enum_declaration(); + bool match_struct_declaration(); + bool match_literal(); + bool match_unary_expression(); + bool match_boolean_literal(); + bool match_keyword(const 
String&); + bool match_block_statement(); + + Optional> parse_parameter_list(ASTNode& parent); + Optional consume_whitespace(); + void consume_preprocessor(); + + /* parse_*() helpers: each takes the node that is passed on as parent to the nodes it creates (as seen in the definitions that call create_ast_node with it). */ NonnullRefPtr parse_declaration(ASTNode& parent, DeclarationType); + NonnullRefPtr parse_function_declaration(ASTNode& parent); + NonnullRefPtr parse_function_definition(ASTNode& parent); + NonnullRefPtr parse_statement(ASTNode& parent); + NonnullRefPtr parse_variable_declaration(ASTNode& parent); + NonnullRefPtr parse_expression(ASTNode& parent); + NonnullRefPtr parse_primary_expression(ASTNode& parent); + NonnullRefPtr parse_secondary_expression(ASTNode& parent, NonnullRefPtr lhs); + NonnullRefPtr parse_function_call(ASTNode& parent); + NonnullRefPtr parse_string_literal(ASTNode& parent); + NonnullRefPtr parse_return_statement(ASTNode& parent); + NonnullRefPtr parse_enum_declaration(ASTNode& parent); + NonnullRefPtr parse_struct_or_class_declaration(ASTNode& parent, StructOrClassDeclaration::Type); + NonnullRefPtr parse_member_declaration(ASTNode& parent); + NonnullRefPtr parse_literal(ASTNode& parent); + NonnullRefPtr parse_unary_expression(ASTNode& parent); + NonnullRefPtr parse_boolean_literal(ASTNode& parent); + NonnullRefPtr parse_type(ASTNode& parent); + NonnullRefPtr parse_binary_expression(ASTNode& parent, NonnullRefPtr lhs, BinaryOp); + NonnullRefPtr parse_assignment_expression(ASTNode& parent, NonnullRefPtr lhs, AssignmentOp); + NonnullRefPtr parse_for_statement(ASTNode& parent); + NonnullRefPtr parse_block_statement(ASTNode& parent); + NonnullRefPtr parse_comment(ASTNode& parent); + + /* Token stream primitives. */ bool match(Token::Type); + Token consume(Token::Type); + Token consume(); + Token consume_keyword(const String&); + Token peek() const; + Optional peek(Token::Type) const; + Position position() const; + + /* Used as a pair around speculative lookahead, as in match_function_call(). Presumably they push and pop m_state on m_saved_states (confirm in Parser.cpp). */ void save_state(); + void load_state(); + + enum class Context { + InTranslationUnit, + InFunctionDefinition, + }; + + /* Current parser position: the active Context plus token_index, which is used as an index into m_tokens. */ struct State { + Context context { Context::InTranslationUnit }; + size_t token_index { 0 
}; + }; + + StringView text_of_token(const Cpp::Token& token); + void error(StringView message = {}); + + size_t node_span_size(const ASTNode& node) const; + + /* Allocates a node of type T with parent as its parent and the given start and end, forwarding args to the constructor of T. The node is appended to m_nodes before it is returned. */ template + NonnullRefPtr + create_ast_node(ASTNode& parent, const Position& start, Optional end, Args&&... args) + { + auto node = adopt(*new T(&parent, start, end, forward(args)...)); + m_nodes.append(node); + return node; + } + + /* Creates the TranslationUnit root node with a null parent, appends it to m_nodes and stores it in m_root_node. */ NonnullRefPtr + create_root_ast_node(const Position& start, Position end) + { + auto node = adopt(*new TranslationUnit(nullptr, start, end)); + m_nodes.append(node); + m_root_node = node; + return node; + } + + /* Parser data. m_state is the live position; m_saved_states presumably holds the snapshots taken by save_state() (confirm in Parser.cpp). */ StringView m_program; + Vector m_lines; + Vector m_tokens; + State m_state; + Vector m_saved_states; + RefPtr m_root_node; + NonnullRefPtrVector m_nodes; + Vector m_errors; +}; + +} diff --git a/Userland/Utilities/CMakeLists.txt b/Userland/Utilities/CMakeLists.txt index 9352734b1bc..cea432a38d6 100644 --- a/Userland/Utilities/CMakeLists.txt +++ b/Userland/Utilities/CMakeLists.txt @@ -47,3 +47,4 @@ target_link_libraries(test-web LibWeb) target_link_libraries(tt LibPthread) target_link_libraries(grep LibRegex) target_link_libraries(gunzip LibCompress) +target_link_libraries(CppParserTest LibCpp) diff --git a/Userland/Utilities/CppParserTest.cpp b/Userland/Utilities/CppParserTest.cpp new file mode 100644 index 00000000000..2fd78329aa3 --- /dev/null +++ b/Userland/Utilities/CppParserTest.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021, the SerenityOS developers. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "AK/Forward.h" +#include "LibCpp/AST.h" +#include +#include +#include + +/* Command line driver for the LibCpp parser. Reads the file named by the optional cpp-file argument (Source/little/main.cpp when omitted) and exits with status 1 after perror if it cannot be opened. With the tokens option (short flag T) it only prints the tokens and returns 0; otherwise it parses the content, logs every parser error through dbgln and dumps the AST starting at the root. */ int main(int argc, char** argv) +{ + Core::ArgsParser args_parser; + const char* path = nullptr; + bool tokens_mode = false; + args_parser.add_option(tokens_mode, "Print Tokens", "tokens", 'T'); + args_parser.add_positional_argument(path, "Cpp File", "cpp-file", Core::ArgsParser::Required::No); + args_parser.parse(argc, argv); + + if (!path) + path = "Source/little/main.cpp"; + auto file = Core::File::construct(path); + if (!file->open(Core::IODevice::ReadOnly)) { + perror("open"); + exit(1); + } + auto content = file->read_all(); + StringView content_view(content); + ::Cpp::Parser parser(content_view); + if (tokens_mode) { + parser.print_tokens(); + return 0; + } + auto root = parser.parse(); + + dbgln("Parser errors:"); + for (auto& error : parser.errors()) { + dbgln("{}", error); + } + + root->dump(0); +}