{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dassault Systemes SE']\n",
      "['Schneider Electric SE']\n",
      "['China Electronics Huada Technology Co. Ltd.']\n",
      "['Kingsoft Cloud Holdings Ltd. ADR']\n",
      "['Baidu Inc.']\n",
      "['Kingdee International Software Group Co. Ltd.']\n",
      "['Tencent Holdings Ltd.']\n",
      "['Xiaomi Corp.']\n",
      "['AsiaInfo Technologies Ltd.']\n",
      "['Fanuc Corp.']\n",
      "['Omron Corp.']\n",
      "['Mitsubishi Electric Corp.']\n",
      "['Hexagon AB Series B']\n",
      "['Hollysys Automation Technologies Ltd.']\n",
      "['JD.com Inc. ADR']\n",
      "['Autodesk Inc.']\n",
      "['Altair Engineering Inc. Cl A']\n",
      "['Ansys Inc.']\n",
      "['Honeywell International Inc.']\n",
      "['PTC Inc.']\n",
      "['Texas Instruments Inc.']\n",
      "['Cadence Design Systems Inc.']\n",
      "['Cisco Systems Inc.']\n",
      "['Microsoft Corp.']\n",
      "['Analog Devices Inc.']\n",
      "['Amazon.com Inc.']\n",
      "['Intel Corp.']\n",
      "['ABB Ltd. ADR']\n",
      "['International Business Machines Corp.']\n",
      "['Oracle Corp.']\n",
      "['Salesforce Inc.']\n",
      "['SAP SE ADR']\n",
      "['Emerson Electric Co.']\n",
      "['Dell Technologies Inc. Cl C']\n",
      "['Hewlett Packard Enterprise Co.']\n",
      "['Rockwell Automation Inc.']\n",
      "['General Electric Co.']\n",
      "['Synopsys Inc.']\n",
      "['STMicroelectronics N.V.']\n",
      "['Alibaba Group Holding Ltd. ADR']\n",
      "['Siemens AG']\n",
      "['Infineon Technologies AG']\n"
     ]
    }
   ],
   "source": [
    "from urllib.request import urlopen\n",
    "from bs4 import BeautifulSoup\n",
    "import time\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "select_year = 2021\n",
    "Firm = pd.read_csv(\"Firm.csv\", index_col=0)\n",
    "select_firm = Firm.loc[Firm[\"Source\"]==\"marketwatch\",:\"Type_Region\"]\n",
    "select_firm[\"Exchange\"] = None\n",
    "select_firm[\"Crawled_Name\"] = None\n",
    "select_firm[\"Stock_Currency\"] = None\n",
    "\n",
    "for code, row in select_firm.iterrows():\n",
    "    try:\n",
    "        url = f\"https://www.marketwatch.com/investing/stock/{row['Stock_Code']}/financials/balance-sheet\"\n",
    "        if row['Stock_Region'] != \"US\":\n",
    "            url += f\"?countrycode={row['Stock_Region']}\"\n",
    "        html = urlopen(url)\n",
    "        time.sleep(0.1)\n",
    "        bsObj = BeautifulSoup(html.read(), 'html.parser')\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        print(\"Can't open url.\")\n",
    "    else:\n",
    "        basic_info = bsObj.find(name=\"script\", attrs={'type':\"application/ld+json\"}).contents[0]\n",
    "        basic_info = json.loads(basic_info.replace('\\n','').replace('\\t',''))\n",
    "        select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Exchange\"] = basic_info[\"exchange\"]\n",
    "        select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Crawled_Name\"] = basic_info[\"name\"]\n",
    "        select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Stock_Currency\"] = basic_info[\"priceCurrency\"]\n",
    "\n",
    "        bs_table = bsObj.find(name=\"table\", attrs={\"aria-label\":\"Financials - Assets data table\"})\n",
    "        if bs_table is not None:\n",
    "            bs = bs_table.find(name=\"tr\", attrs={\"class\":\"table__row\"}).find_all(lambda tag: tag.name == 'th' and \n",
    "                                            tag.get('class') == ['overflow__heading'])\n",
    "            years_list = [i.div.contents[0] for i in bs][0:-1]\n",
    "            years_list = list(map(int, years_list))\n",
    "            bs = bs_table.find_all(name=\"tr\", attrs={\"class\":\"table__row is-highlighted\"})\n",
    "            for i in bs:\n",
    "                if i.td.div.contents[0] == 'Total Assets':\n",
    "                    string = i.find(\"div\", attrs={\"class\":\"chart--financials js-financial-chart\"}).attrs['data-chart-data']\n",
    "                    total_assets_list = string.split(',')\n",
    "                    total_assets_list = list(map(lambda x: float(x) if x!='' else 0, total_assets_list))\n",
    "            total_assets = pd.DataFrame({\"year\":years_list,'total_asset':total_assets_list})\n",
    "            select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Assets\"] = total_assets.loc[total_assets['year']==select_year,'total_asset'].to_list()[0]\n",
    "    try:\n",
    "        url = f\"https://www.marketwatch.com/investing/stock/{row['Stock_Code']}/financials\"\n",
    "        if row['Stock_Region'] != \"US\":\n",
    "            url += f\"?countrycode={row['Stock_Region']}\"\n",
    "        html = urlopen(url)\n",
    "        time.sleep(0.1)\n",
    "        bsObj = BeautifulSoup(html.read(), 'html.parser')\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        print(\"Can't open url.\")\n",
    "    else:\n",
    "        basic_info = bsObj.find(name=\"script\", attrs={'type':\"application/ld+json\"}).contents[0]\n",
    "        basic_info = json.loads(basic_info.replace('\\n','').replace('\\t',''))\n",
    "        select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Exchange\"] = basic_info[\"exchange\"]\n",
    "        select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Crawled_Name\"] = basic_info[\"name\"]\n",
    "        select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Stock_Currency\"] = basic_info[\"priceCurrency\"]\n",
    "\n",
    "        bs_table = bsObj.find(name=\"table\", attrs={\"aria-label\":\"Financials - data table\"})\n",
    "        if bs_table is not None:\n",
    "            bs = bs_table.find(name=\"tr\", attrs={\"class\":\"table__row\"}).find_all(lambda tag: tag.name == 'th' and \n",
    "                                            tag.get('class') == ['overflow__heading'])\n",
    "            years_list = [i.div.contents[0] for i in bs][0:-1]\n",
    "            years_list = list(map(int, years_list))\n",
    "            bs = bs_table.find_all(name=\"tr\", attrs={\"class\":\"table__row is-highlighted\"})\n",
    "            for i in bs:\n",
    "                if i.td.div.contents[0] == 'Sales/Revenue':\n",
    "                    string = i.find(\"div\", attrs={\"class\":\"chart--financials js-financial-chart\"}).attrs['data-chart-data']\n",
    "                    total_revenue_list = string.split(',')\n",
    "                    total_revenue_list = list(map(lambda x: float(x) if x!='' else 0, total_revenue_list))\n",
    "            total_revenue = pd.DataFrame({\"year\":years_list,'total_revenue':total_revenue_list})\n",
    "            select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Revenue\"] = total_revenue.loc[total_revenue['year']==select_year,'total_revenue'].to_list()[0]\n",
    "        print(select_firm.loc[select_firm[\"Stock_Code\"]==row['Stock_Code'], \"Crawled_Name\"].to_list())\n",
    "\n",
    "select_firm['Exchange_Rate'] = select_firm['Stock_Currency'].map({'EUR': 7.2505, 'HKD': 0.88, 'JPY': 0.051, 'SEK': 0.65, 'USD': 6.88})\n",
    "select_firm['Assets_zh'] = select_firm['Assets'] * select_firm['Exchange_Rate'] * 0.00000001\n",
    "select_firm['Revenue_zh'] = select_firm['Revenue'] * select_firm['Exchange_Rate'] * 0.00000001\n",
    "select_firm.to_csv('MarketWatch.csv', index=False, encoding='utf-8-sig')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "bcdafc093860683ffb58d6956591562b7f8ed5d58147d17d71a5d4d6605a08df"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}