{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MarketWatch financials crawler\n",
    "\n",
    "For every firm in `Firm.csv` whose `Source` is `marketwatch`, this notebook fetches the MarketWatch balance-sheet and financials pages, records the listing metadata (exchange, name, price currency) plus the `Total Assets` and `Sales/Revenue` figures for `SELECT_YEAR`, multiplies both figures by a fixed per-currency rate and writes the result to `MarketWatch.csv`.\n",
    "\n",
    "- Pages that cannot be fetched or parsed are reported and skipped; the affected cells stay empty.\n",
    "- **TODO**: confirm the source and date of the rates in `EXCHANGE_RATES` and the intended unit of the `*_zh` columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
    "from urllib.request import urlopen\n",
    "\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SELECT_YEAR = 2021\n",
    "REQUEST_DELAY_SECONDS = 0.1\n",
    "BASE_URL = \"https://www.marketwatch.com/investing/stock\"\n",
    "\n",
    "# One entry per page to crawl:\n",
    "# (sub-page under /financials, aria-label of the table, label of the row to read, destination column)\n",
    "PAGES = [\n",
    "    (\"balance-sheet\", \"Financials - Assets data table\", \"Total Assets\", \"Assets\"),\n",
    "    (\"\", \"Financials - data table\", \"Sales/Revenue\", \"Revenue\"),\n",
    "]\n",
    "\n",
    "# Destination column -> key in the page's JSON-LD metadata.\n",
    "METADATA_FIELDS = {\"Exchange\": \"exchange\", \"Crawled_Name\": \"name\", \"Stock_Currency\": \"priceCurrency\"}\n",
    "\n",
    "# Multiplier applied to each figure, keyed by the crawled price currency.\n",
    "# NOTE(review): these look like CNY-per-unit rates - confirm the source and the date they refer to.\n",
    "# Currencies missing from this table end up with NaN in Exchange_Rate and in the *_zh columns.\n",
    "EXCHANGE_RATES = {'EUR': 7.2505, 'HKD': 0.88, 'JPY': 0.051, 'SEK': 0.65, 'USD': 6.88}\n",
    "\n",
    "# Scale applied after the currency conversion.\n",
    "# NOTE(review): presumably expresses the result in units of 1e8 - confirm.\n",
    "UNIT_SCALE = 1e-8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_page(stock_code, stock_region, page=\"\"):\n",
    "    \"\"\"Download one MarketWatch financials page and parse it.\n",
    "\n",
    "    Args:\n",
    "        stock_code: ticker placed in the URL path.\n",
    "        stock_region: country code; appended as ?countrycode=... unless it is 'US'.\n",
    "        page: optional sub-page under /financials (for example 'balance-sheet').\n",
    "\n",
    "    Returns:\n",
    "        A BeautifulSoup document, or None when the page could not be fetched or parsed.\n",
    "    \"\"\"\n",
    "    url = f\"{BASE_URL}/{stock_code}/financials\"\n",
    "    if page:\n",
    "        url += f\"/{page}\"\n",
    "    if stock_region != \"US\":\n",
    "        url += f\"?countrycode={stock_region}\"\n",
    "    try:\n",
    "        # The context manager closes the HTTP response even if read() fails.\n",
    "        with urlopen(url) as response:\n",
    "            html = response.read()\n",
    "        time.sleep(REQUEST_DELAY_SECONDS)\n",
    "        return BeautifulSoup(html, 'html.parser')\n",
    "    except Exception as e:\n",
    "        # Best effort: one unreachable page must not abort the whole crawl.\n",
    "        print(e)\n",
    "        print(\"Can't open url.\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def read_basic_info(soup):\n",
    "    \"\"\"Return the page's JSON-LD metadata as a dict, or None when it is absent or unusable.\"\"\"\n",
    "    tag = soup.find(name=\"script\", attrs={'type': \"application/ld+json\"})\n",
    "    if tag is None or not tag.contents:\n",
    "        return None\n",
    "    try:\n",
    "        info = json.loads(tag.contents[0].replace('\\n', '').replace('\\t', ''))\n",
    "    except ValueError as e:\n",
    "        print(e)\n",
    "        print(\"Can't parse page metadata.\")\n",
    "        return None\n",
    "    return info if isinstance(info, dict) else None\n",
    "\n",
    "\n",
    "def read_yearly_figure(soup, table_label, row_label, year):\n",
    "    \"\"\"Read one highlighted row of a financials table and return its value for `year`.\n",
    "\n",
    "    Args:\n",
    "        soup: parsed page returned by fetch_page.\n",
    "        table_label: aria-label of the table to read.\n",
    "        row_label: text of the first cell of the wanted row.\n",
    "        year: year to look up among the table's column headings.\n",
    "\n",
    "    Returns:\n",
    "        The value as a float (0 when the page has a blank entry for that year), or None when\n",
    "        the table, the row or the year is not present.\n",
    "    \"\"\"\n",
    "    table = soup.find(name=\"table\", attrs={\"aria-label\": table_label})\n",
    "    if table is None:\n",
    "        return None\n",
    "    header = table.find(name=\"tr\", attrs={\"class\": \"table__row\"})\n",
    "    if header is None:\n",
    "        return None\n",
    "    headings = header.find_all(lambda tag: tag.name == 'th' and tag.get('class') == ['overflow__heading'])\n",
    "    try:\n",
    "        # The last matching heading is not a year column and is skipped.\n",
    "        years = [int(heading.div.contents[0]) for heading in headings[:-1]]\n",
    "    except (AttributeError, IndexError, ValueError) as e:\n",
    "        print(e)\n",
    "        print(\"Can't read the year headings.\")\n",
    "        return None\n",
    "\n",
    "    value = None\n",
    "    for table_row in table.find_all(name=\"tr\", attrs={\"class\": \"table__row is-highlighted\"}):\n",
    "        first_cell = table_row.td\n",
    "        if first_cell is None or first_cell.div is None or not first_cell.div.contents:\n",
    "            continue\n",
    "        if first_cell.div.contents[0] != row_label:\n",
    "            continue\n",
    "        chart = table_row.find(\"div\", attrs={\"class\": \"chart--financials js-financial-chart\"})\n",
    "        if chart is None or 'data-chart-data' not in chart.attrs:\n",
    "            continue\n",
    "        # Blank entries are recorded as 0, so a 0 in the output may mean 'not reported'.\n",
    "        figures = [float(x) if x != '' else 0 for x in chart.attrs['data-chart-data'].split(',')]\n",
    "        match = next((figure for y, figure in zip(years, figures) if y == year), None)\n",
    "        if match is not None:\n",
    "            value = match\n",
    "    return value"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the firms to crawl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "firms_raw = pd.read_csv(\"Firm.csv\", index_col=0)\n",
    "\n",
    "# .copy() makes select_firm an independent frame, so adding columns below\n",
    "# never writes through to (or warns about) a slice of firms_raw.\n",
    "select_firm = firms_raw.loc[firms_raw[\"Source\"] == \"marketwatch\", :\"Type_Region\"].copy()\n",
    "assert {\"Stock_Code\", \"Stock_Region\"}.issubset(select_firm.columns), \"missing expected columns\"\n",
    "\n",
    "for column in (\"Exchange\", \"Crawled_Name\", \"Stock_Currency\"):\n",
    "    select_firm[column] = None\n",
    "# Created up front so the export still works when no page could be parsed.\n",
    "for column in (\"Assets\", \"Revenue\"):\n",
    "    select_firm[column] = float(\"nan\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Crawl MarketWatch\n",
    "\n",
    "One line is printed per firm with the name found on the page."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dassault Systemes SE']\n",
      "['Schneider Electric SE']\n",
      "['China Electronics Huada Technology Co. Ltd.']\n",
      "['Kingsoft Cloud Holdings Ltd. ADR']\n",
      "['Baidu Inc.']\n",
      "['Kingdee International Software Group Co. Ltd.']\n",
      "['Tencent Holdings Ltd.']\n",
      "['Xiaomi Corp.']\n",
      "['AsiaInfo Technologies Ltd.']\n",
      "['Fanuc Corp.']\n",
      "['Omron Corp.']\n",
      "['Mitsubishi Electric Corp.']\n",
      "['Hexagon AB Series B']\n",
      "['Hollysys Automation Technologies Ltd.']\n",
      "['JD.com Inc. ADR']\n",
      "['Autodesk Inc.']\n",
      "['Altair Engineering Inc. Cl A']\n",
      "['Ansys Inc.']\n",
      "['Honeywell International Inc.']\n",
      "['PTC Inc.']\n",
      "['Texas Instruments Inc.']\n",
      "['Cadence Design Systems Inc.']\n",
      "['Cisco Systems Inc.']\n",
      "['Microsoft Corp.']\n",
      "['Analog Devices Inc.']\n",
      "['Amazon.com Inc.']\n",
      "['Intel Corp.']\n",
      "['ABB Ltd. ADR']\n",
      "['International Business Machines Corp.']\n",
      "['Oracle Corp.']\n",
      "['Salesforce Inc.']\n",
      "['SAP SE ADR']\n",
      "['Emerson Electric Co.']\n",
      "['Dell Technologies Inc. Cl C']\n",
      "['Hewlett Packard Enterprise Co.']\n",
      "['Rockwell Automation Inc.']\n",
      "['General Electric Co.']\n",
      "['Synopsys Inc.']\n",
      "['STMicroelectronics N.V.']\n",
      "['Alibaba Group Holding Ltd. ADR']\n",
      "['Siemens AG']\n",
      "['Infineon Technologies AG']\n"
     ]
    }
   ],
   "source": [
    "for _, firm in select_firm.iterrows():\n",
    "    # Results are written to every row sharing this stock code.\n",
    "    same_code = select_firm[\"Stock_Code\"] == firm['Stock_Code']\n",
    "    for page, table_label, row_label, column in PAGES:\n",
    "        soup = fetch_page(firm['Stock_Code'], firm['Stock_Region'], page)\n",
    "        if soup is None:\n",
    "            continue\n",
    "        basic_info = read_basic_info(soup)\n",
    "        if basic_info is not None:\n",
    "            for target, key in METADATA_FIELDS.items():\n",
    "                # Only overwrite with keys the page actually provides.\n",
    "                if key in basic_info:\n",
    "                    select_firm.loc[same_code, target] = basic_info[key]\n",
    "        value = read_yearly_figure(soup, table_label, row_label, SELECT_YEAR)\n",
    "        if value is not None:\n",
    "            select_firm.loc[same_code, column] = value\n",
    "    print(select_firm.loc[same_code, \"Crawled_Name\"].to_list())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert and export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "firms_converted = select_firm.assign(\n",
    "    Exchange_Rate=lambda df: df['Stock_Currency'].map(EXCHANGE_RATES),\n",
    "    Assets_zh=lambda df: df['Assets'] * df['Exchange_Rate'] * UNIT_SCALE,\n",
    "    Revenue_zh=lambda df: df['Revenue'] * df['Exchange_Rate'] * UNIT_SCALE,\n",
    ")\n",
    "firms_converted.to_csv('MarketWatch.csv', index=False, encoding='utf-8-sig')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "bcdafc093860683ffb58d6956591562b7f8ed5d58147d17d71a5d4d6605a08df"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}